diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/args.json new file mode 100644 index 0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + 
"tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + 
"local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + 
"skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + 
"fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + 
"ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, 
is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, 
remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9382b01beb363242364301b81b46ec44cd9a6e47 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:caeb9ecd37ea8841ff58123ea81404889057d49b56520a37fa5c34926630f7be +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + 
"split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + 
"log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + 
"stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, 
logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 
'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, 
force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fa5b7d27acbfc8f5474b39813592e5ed149810d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b83e3d91302e796c449274af6a53a12b36cc8fd74a8e571a5b04f2c7a5e71d +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29b655571c9e99a22347a2efa93dee45b7bf44a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c5301ef6db39cc13854a12bcb91379d626883bb3ae1195cae8ae79f759d01a67 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4630ab910148ec496a9371c15027fce0c8148456 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3f4e8e1a1b6f61ec61b812ff2ba61b0a7e5785cd590c83b6c86e91800cac88 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85178f05a86ad889883386505dce2b07272a874 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b88a9dee27b13485971f65c54fd1abf7f03ebc80932a59b02e1688ef0cce8f +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/latest new file mode 100644 index 
0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d735bc4a6de4f75bc538245ee7a58adad66d89 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.62207031, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 
1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + "logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + 
"eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.14339630587567886, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -0.296875, + "logits/rejected": 0.06689453125, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.9431640625, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.65625, + "rewards/margins": 5.5, + "rewards/rejected": 2.15625, + "step": 25, + "train_speed(iter/s)": 0.044105 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.10599759576627858, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.119140625, + "logps/chosen": -544.0, + "logps/rejected": -632.0, + "loss": 0.8330078125, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.875, + "rewards/margins": 7.8125, + "rewards/rejected": 1.03125, + "step": 30, + "train_speed(iter/s)": 0.044482 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.08272331943435543, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.462890625, + "logits/rejected": -0.1259765625, + "logps/chosen": -572.0, + "logps/rejected": -524.0, + "loss": 0.817724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.82421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 8.875, + "rewards/rejected": 0.81640625, + "step": 35, + "train_speed(iter/s)": 0.044712 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0696400325437939, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -0.15234375, + "logits/rejected": 0.044189453125, + "logps/chosen": -564.0, + "logps/rejected": -656.0, + "loss": 0.7612060546875, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.8125, + 
"rewards/margins": 8.3125, + "rewards/rejected": 1.4921875, + "step": 40, + "train_speed(iter/s)": 0.045037 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -0.4296875, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1200.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.65625, + "eval_nll_loss": 0.82421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.25, + "eval_rewards/margins": 6.0625, + "eval_rewards/rejected": 3.15625, + "eval_runtime": 2.5963, + "eval_samples_per_second": 1.541, + "eval_steps_per_second": 0.77, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.07254373381418182, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.474609375, + "logits/rejected": 0.1015625, + "logps/chosen": -430.0, + "logps/rejected": -684.0, + "loss": 0.793756103515625, + "memory(GiB)": 25.7, + "nll_loss": 0.828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 7.4375, + "rewards/rejected": 2.5625, + "step": 45, + "train_speed(iter/s)": 0.045165 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.09461146468300914, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -0.263671875, + "logits/rejected": 0.1494140625, + "logps/chosen": -442.0, + "logps/rejected": -572.0, + "loss": 0.829718017578125, + "memory(GiB)": 25.7, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.875, + "rewards/margins": 8.5, + "rewards/rejected": 2.40625, + "step": 50, + "train_speed(iter/s)": 0.044966 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.04766707435910784, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -0.058349609375, + "logits/rejected": -0.01220703125, + "logps/chosen": -464.0, + "logps/rejected": -496.0, + "loss": 0.75626220703125, + "memory(GiB)": 25.7, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.6875, + "rewards/rejected": 2.65625, + "step": 55, + 
"train_speed(iter/s)": 0.045216 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.06131605999351018, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -0.125, + "logits/rejected": 0.052490234375, + "logps/chosen": -500.0, + "logps/rejected": -504.0, + "loss": 0.786712646484375, + "memory(GiB)": 25.7, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 8.9375, + "rewards/rejected": 2.953125, + "step": 60, + "train_speed(iter/s)": 0.0454 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -0.30859375, + "eval_logits/rejected": -1.0, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.634765625, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 7.28125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5509, + "eval_samples_per_second": 1.568, + "eval_steps_per_second": 0.784, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06676936632421913, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.38671875, + "logits/rejected": -0.025146484375, + "logps/chosen": -560.0, + "logps/rejected": -676.0, + "loss": 0.79617919921875, + "memory(GiB)": 25.7, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.234375, + "step": 65, + "train_speed(iter/s)": 0.045398 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.05652873146800221, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -0.36328125, + "logits/rejected": 0.306640625, + "logps/chosen": -360.0, + "logps/rejected": -644.0, + "loss": 0.68297119140625, + "memory(GiB)": 25.7, + "nll_loss": 0.66015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5, + "rewards/margins": 8.375, + "rewards/rejected": 3.09375, + "step": 70, + "train_speed(iter/s)": 0.04555 + }, + { + "epoch": 3.121212121212121, + 
"grad_norm": 0.06536273458529179, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -0.1416015625, + "logits/rejected": 0.011962890625, + "logps/chosen": -536.0, + "logps/rejected": -556.0, + "loss": 0.849755859375, + "memory(GiB)": 25.7, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 8.9375, + "rewards/rejected": 3.015625, + "step": 75, + "train_speed(iter/s)": 0.045382 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04825974409413595, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -0.1064453125, + "logits/rejected": -0.043701171875, + "logps/chosen": -502.0, + "logps/rejected": -458.0, + "loss": 0.706292724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 8.9375, + "rewards/rejected": 3.15625, + "step": 80, + "train_speed(iter/s)": 0.045536 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -0.2421875, + "eval_logits/rejected": -0.9765625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6259765625, + "eval_nll_loss": 0.80859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 7.875, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.4567, + "eval_samples_per_second": 1.628, + "eval_steps_per_second": 0.814, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.056385847668733294, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -0.2392578125, + "logits/rejected": 0.15625, + "logps/chosen": -604.0, + "logps/rejected": -616.0, + "loss": 0.7456787109375, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.6875, + "rewards/margins": 9.8125, + "rewards/rejected": 2.84375, + "step": 85, + "train_speed(iter/s)": 0.045586 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.05524440051382823, + "learning_rate": 
1.6135921418712956e-05, + "logits/chosen": -0.294921875, + "logits/rejected": 0.1328125, + "logps/chosen": -432.0, + "logps/rejected": -588.0, + "loss": 0.671728515625, + "memory(GiB)": 25.7, + "nll_loss": 0.6484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 8.9375, + "rewards/rejected": 2.859375, + "step": 90, + "train_speed(iter/s)": 0.045721 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.05977092210820939, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -0.361328125, + "logits/rejected": 0.185546875, + "logps/chosen": -366.0, + "logps/rejected": -648.0, + "loss": 0.734991455078125, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 9.0, + "rewards/rejected": 2.5625, + "step": 95, + "train_speed(iter/s)": 0.045829 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.06596771804969032, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -0.427734375, + "logits/rejected": 0.251953125, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.818841552734375, + "memory(GiB)": 25.7, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.6875, + "rewards/margins": 8.4375, + "rewards/rejected": 3.234375, + "step": 100, + "train_speed(iter/s)": 0.045708 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -0.21875, + "eval_logits/rejected": -0.9609375, + "eval_logps/chosen": -1160.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6220703125, + "eval_nll_loss": 0.8046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.25, + "eval_rewards/margins": 8.8125, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.2433, + "eval_samples_per_second": 1.783, + "eval_steps_per_second": 0.892, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { 
+ "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9413787058176.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + 
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def 
parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + 
+[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", 
+ "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c952edaa4e283e48a9e5a5f345d7488cb372a5b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da89b39f2e5478257b08c125b632b90bdf712de8bbf9dc62267950fdcec4025b +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": 
-1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + 
"local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c4a36988bed07ff55d4e5b54250b738925d81c7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:694ced13812778e6fc64fe42fa77fa19b969eed345458b6493ace1de30583356 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc555043d23555ec1e17c59b63e6bfdc5beeb647 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9a4d8eaa22c58e5ee5cd58da5a02de8d342fe35ddc67b92714c7d46a1d27907 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd73fc484029ac8e61a42ce1da4c0c01c8a6e8e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f50fc238799cb875ba398f5645372c169d3d5a781b4b52e2b6d28c15ca91043 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea35f6170ea4973cf2b61b37628d99dd422f06f5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b692f2e44a761349843c093c6e0f9a39386a746f0716c83b76c53131568bd73 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..1ff406405418d84068458850f74aecfc6224f793 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/latest @@ -0,0 +1 @@ +global_step122 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a49f44ba05d98a84fd55c18c4fa41c6437c8853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff 
+size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..79ef7e8924723bd699efa313eb78103d80b7edb9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40007a79aad967206b797079ca5147beff46ee1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede1043a0735266b510faa06f578fa6ef180c11e994a142a88a13ac6f33eb78b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..347f4ddca6a70c807bdc836c272752d88620574f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.62011719, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120", + "epoch": 4.96969696969697, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + 
"logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + "eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.14339630587567886, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -0.296875, + "logits/rejected": 0.06689453125, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.9431640625, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.65625, + "rewards/margins": 5.5, + "rewards/rejected": 2.15625, + "step": 25, + "train_speed(iter/s)": 0.044105 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.10599759576627858, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.119140625, + "logps/chosen": -544.0, + "logps/rejected": -632.0, + "loss": 
0.8330078125, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.875, + "rewards/margins": 7.8125, + "rewards/rejected": 1.03125, + "step": 30, + "train_speed(iter/s)": 0.044482 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.08272331943435543, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.462890625, + "logits/rejected": -0.1259765625, + "logps/chosen": -572.0, + "logps/rejected": -524.0, + "loss": 0.817724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.82421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 8.875, + "rewards/rejected": 0.81640625, + "step": 35, + "train_speed(iter/s)": 0.044712 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0696400325437939, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -0.15234375, + "logits/rejected": 0.044189453125, + "logps/chosen": -564.0, + "logps/rejected": -656.0, + "loss": 0.7612060546875, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.8125, + "rewards/margins": 8.3125, + "rewards/rejected": 1.4921875, + "step": 40, + "train_speed(iter/s)": 0.045037 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -0.4296875, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1200.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.65625, + "eval_nll_loss": 0.82421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.25, + "eval_rewards/margins": 6.0625, + "eval_rewards/rejected": 3.15625, + "eval_runtime": 2.5963, + "eval_samples_per_second": 1.541, + "eval_steps_per_second": 0.77, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.07254373381418182, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.474609375, + "logits/rejected": 0.1015625, + "logps/chosen": -430.0, + "logps/rejected": -684.0, + "loss": 0.793756103515625, + "memory(GiB)": 25.7, + "nll_loss": 0.828125, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 7.4375, + "rewards/rejected": 2.5625, + "step": 45, + "train_speed(iter/s)": 0.045165 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.09461146468300914, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -0.263671875, + "logits/rejected": 0.1494140625, + "logps/chosen": -442.0, + "logps/rejected": -572.0, + "loss": 0.829718017578125, + "memory(GiB)": 25.7, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.875, + "rewards/margins": 8.5, + "rewards/rejected": 2.40625, + "step": 50, + "train_speed(iter/s)": 0.044966 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.04766707435910784, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -0.058349609375, + "logits/rejected": -0.01220703125, + "logps/chosen": -464.0, + "logps/rejected": -496.0, + "loss": 0.75626220703125, + "memory(GiB)": 25.7, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.6875, + "rewards/rejected": 2.65625, + "step": 55, + "train_speed(iter/s)": 0.045216 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.06131605999351018, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -0.125, + "logits/rejected": 0.052490234375, + "logps/chosen": -500.0, + "logps/rejected": -504.0, + "loss": 0.786712646484375, + "memory(GiB)": 25.7, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 8.9375, + "rewards/rejected": 2.953125, + "step": 60, + "train_speed(iter/s)": 0.0454 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -0.30859375, + "eval_logits/rejected": -1.0, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.634765625, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 7.28125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5509, + 
"eval_samples_per_second": 1.568, + "eval_steps_per_second": 0.784, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06676936632421913, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.38671875, + "logits/rejected": -0.025146484375, + "logps/chosen": -560.0, + "logps/rejected": -676.0, + "loss": 0.79617919921875, + "memory(GiB)": 25.7, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.234375, + "step": 65, + "train_speed(iter/s)": 0.045398 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.05652873146800221, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -0.36328125, + "logits/rejected": 0.306640625, + "logps/chosen": -360.0, + "logps/rejected": -644.0, + "loss": 0.68297119140625, + "memory(GiB)": 25.7, + "nll_loss": 0.66015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5, + "rewards/margins": 8.375, + "rewards/rejected": 3.09375, + "step": 70, + "train_speed(iter/s)": 0.04555 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.06536273458529179, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -0.1416015625, + "logits/rejected": 0.011962890625, + "logps/chosen": -536.0, + "logps/rejected": -556.0, + "loss": 0.849755859375, + "memory(GiB)": 25.7, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 8.9375, + "rewards/rejected": 3.015625, + "step": 75, + "train_speed(iter/s)": 0.045382 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04825974409413595, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -0.1064453125, + "logits/rejected": -0.043701171875, + "logps/chosen": -502.0, + "logps/rejected": -458.0, + "loss": 0.706292724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 8.9375, + "rewards/rejected": 3.15625, + "step": 80, + 
"train_speed(iter/s)": 0.045536 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -0.2421875, + "eval_logits/rejected": -0.9765625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6259765625, + "eval_nll_loss": 0.80859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 7.875, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.4567, + "eval_samples_per_second": 1.628, + "eval_steps_per_second": 0.814, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.056385847668733294, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -0.2392578125, + "logits/rejected": 0.15625, + "logps/chosen": -604.0, + "logps/rejected": -616.0, + "loss": 0.7456787109375, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.6875, + "rewards/margins": 9.8125, + "rewards/rejected": 2.84375, + "step": 85, + "train_speed(iter/s)": 0.045586 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.05524440051382823, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -0.294921875, + "logits/rejected": 0.1328125, + "logps/chosen": -432.0, + "logps/rejected": -588.0, + "loss": 0.671728515625, + "memory(GiB)": 25.7, + "nll_loss": 0.6484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 8.9375, + "rewards/rejected": 2.859375, + "step": 90, + "train_speed(iter/s)": 0.045721 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.05977092210820939, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -0.361328125, + "logits/rejected": 0.185546875, + "logps/chosen": -366.0, + "logps/rejected": -648.0, + "loss": 0.734991455078125, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 9.0, + "rewards/rejected": 2.5625, + "step": 95, + "train_speed(iter/s)": 0.045829 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 
0.06596771804969032, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -0.427734375, + "logits/rejected": 0.251953125, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.818841552734375, + "memory(GiB)": 25.7, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.6875, + "rewards/margins": 8.4375, + "rewards/rejected": 3.234375, + "step": 100, + "train_speed(iter/s)": 0.045708 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -0.21875, + "eval_logits/rejected": -0.9609375, + "eval_logps/chosen": -1160.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6220703125, + "eval_nll_loss": 0.8046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.25, + "eval_rewards/margins": 8.8125, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.2433, + "eval_samples_per_second": 1.783, + "eval_steps_per_second": 0.892, + "step": 100 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.06608692576727743, + "learning_rate": 4.2113336672471245e-06, + "logits/chosen": -0.271484375, + "logits/rejected": 0.1005859375, + "logps/chosen": -420.0, + "logps/rejected": -552.0, + "loss": 0.71077880859375, + "memory(GiB)": 25.7, + "nll_loss": 0.671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 9.1875, + "rewards/rejected": 3.171875, + "step": 105, + "train_speed(iter/s)": 0.045758 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 0.05632288765049835, + "learning_rate": 1.8865999845374793e-06, + "logits/chosen": -0.1962890625, + "logits/rejected": 0.1875, + "logps/chosen": -708.0, + "logps/rejected": -868.0, + "loss": 0.800726318359375, + "memory(GiB)": 25.7, + "nll_loss": 0.859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 9.6875, + "rewards/rejected": 3.3125, + "step": 110, + "train_speed(iter/s)": 0.045887 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.11870440122922361, + "learning_rate": 4.738957681248379e-07, + 
"logits/chosen": -0.1572265625, + "logits/rejected": -0.04638671875, + "logps/chosen": -584.0, + "logps/rejected": -592.0, + "loss": 0.803369140625, + "memory(GiB)": 25.7, + "nll_loss": 0.84375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 9.6875, + "rewards/rejected": 2.828125, + "step": 115, + "train_speed(iter/s)": 0.045978 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.07102631187789318, + "learning_rate": 0.0, + "logits/chosen": -0.07666015625, + "logits/rejected": 0.1484375, + "logps/chosen": -568.0, + "logps/rejected": -716.0, + "loss": 0.649658203125, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + "rewards/margins": 10.5, + "rewards/rejected": 2.828125, + "step": 120, + "train_speed(iter/s)": 0.046069 + }, + { + "epoch": 4.96969696969697, + "eval_logits/chosen": -0.20703125, + "eval_logits/rejected": -0.953125, + "eval_logps/chosen": -1160.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6201171875, + "eval_nll_loss": 0.80078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.25, + "eval_rewards/margins": 8.625, + "eval_rewards/rejected": 4.59375, + "eval_runtime": 2.3336, + "eval_samples_per_second": 1.714, + "eval_steps_per_second": 0.857, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 11267474751488.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..66b1af8d39baa736a1177c90fa2e37d5fc673a59 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe56c02239710558a5aa6334d219b5c38c01ad9b80a15c93c31be50e423c19f2 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": 
-1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + 
"local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfeecefded4dd67e4c3d674408418d8a1fb10aa0 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7a99a2d3ec435644f8bbeddad15dd160da1ad74fe5337dd9f367fcbb0f4cdc +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4feb00c7b9bfe93888f2a4e24393675fbfbd05ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4296052a3d8f89d419363542b6d153491e575d64156d1a47cb1139ecf1ef53be +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f646310720c3b9f29d5af4447359561c1c90521 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef80b66cb9665fda10e1c287bf309068c067530e8e93159875f043b452f62af6 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba8e1fd68649ba2ea2f1c48fda23861a1de441a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2921198144698255700f0b59dce87792be511022eedd048e89f00c98ab60799d +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 
diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..754671cc18290e76c0f8a83a095c9712307b7cf5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.78369141, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + 
"logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + "eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1935000764416.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/training_args.bin new file mode 
100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99698abeef5c5ee62c473cef69e9af3847099e74 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:234b57f685ec13374f439b5a1d75cb4ef7948d071552d33e961daa7b0f8ef1d7 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": 
-1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + 
"local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dea09017c05f74bad0f74e7fe06272f48c32e92d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3c1e92bbf4ef0aebc975e9769fe67832b354059d653f6e0b124cf7922c808d3 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db6cb58470f2616c1fb70625b222598b9face536 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:756a4d28394fbc3dd76e6473b10b66584faf9c41120426ccf5979e09f4d29348 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0551ccd65404bbe04e0ac3f0dd0c1ca9971210f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881b8e9eae6a85ffb4aa0cab8ea5abb663a77e610afa8a1d75230186dd867ea5 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e83b853fd0419c832eaa083b512f8dbad45738f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aba0361d28a4cab0466de43f19116b16733f73caf7f61f569f06160bf9c3edb +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 
diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af30fc840bf1931c6dff2286f3b75044b149e8ea --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.65625, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + 
"logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + "eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.14339630587567886, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -0.296875, + "logits/rejected": 0.06689453125, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.9431640625, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.65625, + "rewards/margins": 5.5, + "rewards/rejected": 2.15625, + "step": 25, + "train_speed(iter/s)": 0.044105 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.10599759576627858, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.119140625, + "logps/chosen": -544.0, + "logps/rejected": -632.0, + "loss": 
0.8330078125, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.875, + "rewards/margins": 7.8125, + "rewards/rejected": 1.03125, + "step": 30, + "train_speed(iter/s)": 0.044482 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.08272331943435543, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.462890625, + "logits/rejected": -0.1259765625, + "logps/chosen": -572.0, + "logps/rejected": -524.0, + "loss": 0.817724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.82421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 8.875, + "rewards/rejected": 0.81640625, + "step": 35, + "train_speed(iter/s)": 0.044712 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0696400325437939, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -0.15234375, + "logits/rejected": 0.044189453125, + "logps/chosen": -564.0, + "logps/rejected": -656.0, + "loss": 0.7612060546875, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.8125, + "rewards/margins": 8.3125, + "rewards/rejected": 1.4921875, + "step": 40, + "train_speed(iter/s)": 0.045037 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -0.4296875, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1200.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.65625, + "eval_nll_loss": 0.82421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.25, + "eval_rewards/margins": 6.0625, + "eval_rewards/rejected": 3.15625, + "eval_runtime": 2.5963, + "eval_samples_per_second": 1.541, + "eval_steps_per_second": 0.77, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + 
"attributes": {} + } + }, + "total_flos": 3958362472448.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eabfd044c3371199b476d2d3079c4c6fca8da675 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93840ccef75510c3b79e0cf4d646d906eff44f0412cf99d7638d0c1ce2cb70e8 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": 
-1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + 
"local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d25bd95355cbdbf6cf3da1f25ff1dd7c71d04bb9 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcea323d045c9aa52f2a37d5844303dd32d02e4ab45aff3b0eed6cbd0f527a9b +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f16b1c352def34a8b31acf800fee7b019402b8e7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca3bfaa8249ab3fe84c0d34b9affadef9704e5aeb146408b115cc5175f86a26 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d66364434508a5c3748c75b1164d91f99972dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f573f7e68e47d235331723d53e0e2b94852e73ab87d18b19bf12a9a5ef66d00 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e87180594ae30c1652e34c6356ee94e7d56cc37b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353a8c6ad06e6545638a84b0ee3ff96bee9d1d72dbfec9baf13f431cec8c8b14 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 
diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..588d06c502d64135b437c25b84b4dbf651a12b87 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.63476562, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + 
"logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + "eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.14339630587567886, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -0.296875, + "logits/rejected": 0.06689453125, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.9431640625, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.65625, + "rewards/margins": 5.5, + "rewards/rejected": 2.15625, + "step": 25, + "train_speed(iter/s)": 0.044105 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.10599759576627858, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.119140625, + "logps/chosen": -544.0, + "logps/rejected": -632.0, + "loss": 
0.8330078125, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.875, + "rewards/margins": 7.8125, + "rewards/rejected": 1.03125, + "step": 30, + "train_speed(iter/s)": 0.044482 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.08272331943435543, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.462890625, + "logits/rejected": -0.1259765625, + "logps/chosen": -572.0, + "logps/rejected": -524.0, + "loss": 0.817724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.82421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 8.875, + "rewards/rejected": 0.81640625, + "step": 35, + "train_speed(iter/s)": 0.044712 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0696400325437939, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -0.15234375, + "logits/rejected": 0.044189453125, + "logps/chosen": -564.0, + "logps/rejected": -656.0, + "loss": 0.7612060546875, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.8125, + "rewards/margins": 8.3125, + "rewards/rejected": 1.4921875, + "step": 40, + "train_speed(iter/s)": 0.045037 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -0.4296875, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1200.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.65625, + "eval_nll_loss": 0.82421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.25, + "eval_rewards/margins": 6.0625, + "eval_rewards/rejected": 3.15625, + "eval_runtime": 2.5963, + "eval_samples_per_second": 1.541, + "eval_steps_per_second": 0.77, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.07254373381418182, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.474609375, + "logits/rejected": 0.1015625, + "logps/chosen": -430.0, + "logps/rejected": -684.0, + "loss": 0.793756103515625, + "memory(GiB)": 25.7, + "nll_loss": 0.828125, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 7.4375, + "rewards/rejected": 2.5625, + "step": 45, + "train_speed(iter/s)": 0.045165 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.09461146468300914, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -0.263671875, + "logits/rejected": 0.1494140625, + "logps/chosen": -442.0, + "logps/rejected": -572.0, + "loss": 0.829718017578125, + "memory(GiB)": 25.7, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.875, + "rewards/margins": 8.5, + "rewards/rejected": 2.40625, + "step": 50, + "train_speed(iter/s)": 0.044966 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.04766707435910784, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -0.058349609375, + "logits/rejected": -0.01220703125, + "logps/chosen": -464.0, + "logps/rejected": -496.0, + "loss": 0.75626220703125, + "memory(GiB)": 25.7, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.6875, + "rewards/rejected": 2.65625, + "step": 55, + "train_speed(iter/s)": 0.045216 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.06131605999351018, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -0.125, + "logits/rejected": 0.052490234375, + "logps/chosen": -500.0, + "logps/rejected": -504.0, + "loss": 0.786712646484375, + "memory(GiB)": 25.7, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 8.9375, + "rewards/rejected": 2.953125, + "step": 60, + "train_speed(iter/s)": 0.0454 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -0.30859375, + "eval_logits/rejected": -1.0, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.634765625, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 7.28125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5509, + 
"eval_samples_per_second": 1.568, + "eval_steps_per_second": 0.784, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5664647610368.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1d48ffb291cf0e44b9b5fc7cff5a22293fd6f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9d4507822b6445fb0bae9353e22d18e7836bcd8f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0afe07530ca8a942fde97f77d23f3f23c34bbd943f4aca259882d6be85a4901 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b27912923a77870f523d27d8974b47fa9502849 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": 
-1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + 
"local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91f859e972bb03368271a3ab84f4f534131fb89b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12bfd2d7dd2e921ff379f644807fdb7624623bfa73aa6e5194c784f84f1d792 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6569030c765ebf154b0e55153ec9a77a9fa3520a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e939fee2c1f6ac906ee857bada39e3f69f1b62216e2d2445b8ed8676cc0bf58 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b11e2383d1fe0d9ae08f25ec5344cdf1324e262 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e1af91751f6f712831b27ccd88e9190db7cd0cb6957244cc0457a9f7f334be +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd5496587aabc679a0a739cef4cba4d2153581f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e0b21b942b6d1b79d1104c54d47ae2678c93499ec417d02cad0283144e3ea0 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 
diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a560ae37de2fb8becbb280b2ef3de20181411478 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.62597656, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.24702048103415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.140625, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.94580078125, + "memory(GiB)": 25.69, + "nll_loss": 1.5703125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027363 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.5074727724564423, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.458984375, + "logits/rejected": 0.033935546875, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4593505859375, + "memory(GiB)": 25.7, + "nll_loss": 1.515625, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0234375, + "rewards/margins": -0.042236328125, + "rewards/rejected": 0.06591796875, + "step": 5, + "train_speed(iter/s)": 0.039476 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.9228133591717713, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -0.185546875, + "logits/rejected": -0.034912109375, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 1.776171875, + "memory(GiB)": 25.7, + "nll_loss": 1.1171875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.63671875, + "rewards/rejected": 0.03369140625, + "step": 10, + "train_speed(iter/s)": 0.042464 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7395575770882112, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.07568359375, + "logps/chosen": -652.0, + 
"logps/rejected": -668.0, + "loss": 1.723095703125, + "memory(GiB)": 25.7, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 2.203125, + "rewards/rejected": 0.408203125, + "step": 15, + "train_speed(iter/s)": 0.043941 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.26101957851244034, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.72265625, + "logits/rejected": -0.06982421875, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 1.1149658203125, + "memory(GiB)": 25.7, + "nll_loss": 1.078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": 2.359375, + "step": 20, + "train_speed(iter/s)": 0.04473 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -0.453125, + "eval_logits/rejected": -1.0234375, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.78369140625, + "eval_nll_loss": 0.9921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.46875, + "eval_rewards/margins": 3.40625, + "eval_rewards/rejected": 3.0625, + "eval_runtime": 2.4718, + "eval_samples_per_second": 1.618, + "eval_steps_per_second": 0.809, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.14339630587567886, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -0.296875, + "logits/rejected": 0.06689453125, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.9431640625, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.65625, + "rewards/margins": 5.5, + "rewards/rejected": 2.15625, + "step": 25, + "train_speed(iter/s)": 0.044105 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.10599759576627858, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.119140625, + "logps/chosen": -544.0, + "logps/rejected": -632.0, + "loss": 
0.8330078125, + "memory(GiB)": 25.7, + "nll_loss": 0.87890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.875, + "rewards/margins": 7.8125, + "rewards/rejected": 1.03125, + "step": 30, + "train_speed(iter/s)": 0.044482 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.08272331943435543, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.462890625, + "logits/rejected": -0.1259765625, + "logps/chosen": -572.0, + "logps/rejected": -524.0, + "loss": 0.817724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.82421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 8.875, + "rewards/rejected": 0.81640625, + "step": 35, + "train_speed(iter/s)": 0.044712 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0696400325437939, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -0.15234375, + "logits/rejected": 0.044189453125, + "logps/chosen": -564.0, + "logps/rejected": -656.0, + "loss": 0.7612060546875, + "memory(GiB)": 25.7, + "nll_loss": 0.8046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.8125, + "rewards/margins": 8.3125, + "rewards/rejected": 1.4921875, + "step": 40, + "train_speed(iter/s)": 0.045037 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -0.4296875, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1200.0, + "eval_logps/rejected": -362.0, + "eval_loss": 0.65625, + "eval_nll_loss": 0.82421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.25, + "eval_rewards/margins": 6.0625, + "eval_rewards/rejected": 3.15625, + "eval_runtime": 2.5963, + "eval_samples_per_second": 1.541, + "eval_steps_per_second": 0.77, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.07254373381418182, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.474609375, + "logits/rejected": 0.1015625, + "logps/chosen": -430.0, + "logps/rejected": -684.0, + "loss": 0.793756103515625, + "memory(GiB)": 25.7, + "nll_loss": 0.828125, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 7.4375, + "rewards/rejected": 2.5625, + "step": 45, + "train_speed(iter/s)": 0.045165 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.09461146468300914, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -0.263671875, + "logits/rejected": 0.1494140625, + "logps/chosen": -442.0, + "logps/rejected": -572.0, + "loss": 0.829718017578125, + "memory(GiB)": 25.7, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.875, + "rewards/margins": 8.5, + "rewards/rejected": 2.40625, + "step": 50, + "train_speed(iter/s)": 0.044966 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.04766707435910784, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -0.058349609375, + "logits/rejected": -0.01220703125, + "logps/chosen": -464.0, + "logps/rejected": -496.0, + "loss": 0.75626220703125, + "memory(GiB)": 25.7, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.6875, + "rewards/rejected": 2.65625, + "step": 55, + "train_speed(iter/s)": 0.045216 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.06131605999351018, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -0.125, + "logits/rejected": 0.052490234375, + "logps/chosen": -500.0, + "logps/rejected": -504.0, + "loss": 0.786712646484375, + "memory(GiB)": 25.7, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 8.9375, + "rewards/rejected": 2.953125, + "step": 60, + "train_speed(iter/s)": 0.0454 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -0.30859375, + "eval_logits/rejected": -1.0, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.634765625, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 7.28125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5509, + 
"eval_samples_per_second": 1.568, + "eval_steps_per_second": 0.784, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06676936632421913, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.38671875, + "logits/rejected": -0.025146484375, + "logps/chosen": -560.0, + "logps/rejected": -676.0, + "loss": 0.79617919921875, + "memory(GiB)": 25.7, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.234375, + "step": 65, + "train_speed(iter/s)": 0.045398 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.05652873146800221, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -0.36328125, + "logits/rejected": 0.306640625, + "logps/chosen": -360.0, + "logps/rejected": -644.0, + "loss": 0.68297119140625, + "memory(GiB)": 25.7, + "nll_loss": 0.66015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5, + "rewards/margins": 8.375, + "rewards/rejected": 3.09375, + "step": 70, + "train_speed(iter/s)": 0.04555 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.06536273458529179, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -0.1416015625, + "logits/rejected": 0.011962890625, + "logps/chosen": -536.0, + "logps/rejected": -556.0, + "loss": 0.849755859375, + "memory(GiB)": 25.7, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 8.9375, + "rewards/rejected": 3.015625, + "step": 75, + "train_speed(iter/s)": 0.045382 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04825974409413595, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -0.1064453125, + "logits/rejected": -0.043701171875, + "logps/chosen": -502.0, + "logps/rejected": -458.0, + "loss": 0.706292724609375, + "memory(GiB)": 25.7, + "nll_loss": 0.69921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 8.9375, + "rewards/rejected": 3.15625, + "step": 80, + 
"train_speed(iter/s)": 0.045536 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -0.2421875, + "eval_logits/rejected": -0.9765625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.6259765625, + "eval_nll_loss": 0.80859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 7.875, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.4567, + "eval_samples_per_second": 1.628, + "eval_steps_per_second": 0.814, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7491754524672.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8ee8e8550f700dfc127e890d756ae71a234fe0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c9971bed8fc8c96e32aeec854c7366dccfed250f1c481c02a2a99548410dab4 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..f04ff7e828091476ffe3afcf505a953d8efcd9c9 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..7df3c67e391cff04542472d8821e095f98b0c162 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..8cb3f5a0354a6246c023be5e0fe63124840f933f Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..27c2274f4dc1161db860b3f700ef9755e589f105 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a69686284111b3a12abef9e2194fc0b0cac86734 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..0fe7bc241c9cfc4ac6e26a916e8d13095ca8cd59 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_accuracies.png new file mode 100644 
index 0000000000000000000000000000000000000000..8cead5875b75d617a13f1247e15b17c39a5168b6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..b402d7e83f1f954ffa5bd55810a8ab5c8f8cbe20 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..67663e05a2100599f01a9d99d76d735180d28a2f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..fce331e5d21cf2b4e9107a73199d1995d226e463 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_rewards_rejected.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_runtime.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..6306f23f3e0087c9451ee0884f663131ce70ee99 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..b73568113dbe7989750ceb59e3416223f447127e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..89a42542f9e4136633fbb827fd4e67bb96a6372a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..2c72126dce5328d7e5cbf3f3ba730b04185f2ff2 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_epoch.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..83b2eaed9f4d70fe929c4ff502a93d859c21d27b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..a138c3207804ef5be69dc8ff0d41e1abf82a564e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..64b2f7e581787ef46430fe0fea22034aa4baec12 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_rejected.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..a6657b880047d504f153ded9421439df630bb773 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..4bad678dd9f6ddc275e9bed64e5187a566c5b449 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..4c347ba426b47838e9e88ff185e734dda112af23 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b5bd1a3de641b133c27c194bd5620587e38ac95f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_loss.png differ diff 
--git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..36e8fc5258bd898f9c895e1caf79b1a3c10ba511 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..44a56f0f4354dbec75ba6aa2e9f0379682aa4b38 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..602627225a9de27d88d2f752714b702dc45b7def Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2c94620b820606c551a27525d75652b73a909937 Binary files 
/dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..233da4833a0c111961ba530bbd30b412f9c5a55d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..63e6a3feb6d3c1cad43ba3c93e4dadabe82a0189 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..4583bfc4a6a99e11ae431e4112d8c7bd7c9c8ad0 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_loss.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..eb678e8a3d245610d320e35747c7d3858b1bd284 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..bbbdf63977a2dfb9dd1884174b4b3ea0a5281e0b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..d88630fea79854d554b6427f10b7e0950b443f75 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..9f5e5bd0ddca7c14c418b7c55ccc6618aa752a64 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3a9a20bc3790970c5206eb7f5fa1714dc043ec1d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/logging.jsonl b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..674ccc54f548e1cee1c2322f3f0e1764ccd1c9d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/logging.jsonl @@ -0,0 +1,33 @@ +{"loss": 1.94580078, "grad_norm": 1.24702048, "learning_rate": 1.667e-05, "memory(GiB)": 25.69, "train_speed(iter/s)": 0.027363, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.140625, "logits/chosen": -0.51953125, "nll_loss": 1.5703125, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "32s", "remaining_time": "1h 4m 17s"} +{"loss": 2.45935059, "grad_norm": 1.50747277, "learning_rate": 8.333e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.039476, "rewards/chosen": 0.0234375, "rewards/rejected": 0.06591797, "rewards/accuracies": 0.1875, "rewards/margins": -0.04223633, "logps/rejected": -708.0, "logps/chosen": -712.0, 
"logits/rejected": 0.03393555, "logits/chosen": -0.45898438, "nll_loss": 1.515625, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "2m 2s", "remaining_time": "46m 57s"} +{"loss": 1.77617187, "grad_norm": 0.92281336, "learning_rate": 9.97e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.042464, "rewards/chosen": 0.66796875, "rewards/rejected": 0.03369141, "rewards/accuracies": 0.75, "rewards/margins": 0.63671875, "logps/rejected": -600.0, "logps/chosen": -636.0, "logits/rejected": -0.03491211, "logits/chosen": -0.18554688, "nll_loss": 1.1171875, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "3m 51s", "remaining_time": "42m 24s"} +{"loss": 1.7230957, "grad_norm": 0.73955758, "learning_rate": 9.847e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.043941, "rewards/chosen": 2.609375, "rewards/rejected": 0.40820312, "rewards/accuracies": 0.94999999, "rewards/margins": 2.203125, "logps/rejected": -668.0, "logps/chosen": -652.0, "logits/rejected": -0.07568359, "logits/chosen": -0.51171875, "nll_loss": 1.4140625, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "5m 37s", "remaining_time": "39m 20s"} +{"loss": 1.11496582, "grad_norm": 0.26101958, "learning_rate": 9.632e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.04473, "rewards/chosen": 5.6875, "rewards/rejected": 2.359375, "rewards/accuracies": 1.0, "rewards/margins": 3.328125, "logps/rejected": -596.0, "logps/chosen": -418.0, "logits/rejected": -0.06982422, "logits/chosen": -0.72265625, "nll_loss": 1.078125, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 22s", "remaining_time": "36m 54s"} +{"eval_loss": 0.78369141, "eval_runtime": 2.4718, "eval_samples_per_second": 1.618, "eval_steps_per_second": 0.809, "eval_rewards/chosen": 6.46875, "eval_rewards/rejected": 3.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 
3.40625, "eval_logps/rejected": -362.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.0234375, "eval_logits/chosen": -0.453125, "eval_nll_loss": 0.9921875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 25s", "remaining_time": "37m 7s"} +{"loss": 0.94316406, "grad_norm": 0.14339631, "learning_rate": 9.33e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044105, "rewards/chosen": 7.65625, "rewards/rejected": 2.15625, "rewards/accuracies": 0.95454544, "rewards/margins": 5.5, "logps/rejected": -600.0, "logps/chosen": -552.0, "logits/rejected": 0.06689453, "logits/chosen": -0.296875, "nll_loss": 0.87890625, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "9m 22s", "remaining_time": "35m 38s"} +{"loss": 0.83300781, "grad_norm": 0.1059976, "learning_rate": 8.946e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044482, "rewards/chosen": 8.875, "rewards/rejected": 1.03125, "rewards/accuracies": 1.0, "rewards/margins": 7.8125, "logps/rejected": -632.0, "logps/chosen": -544.0, "logits/rejected": -0.11914062, "logits/chosen": -0.390625, "nll_loss": 0.87890625, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "11m 10s", "remaining_time": "33m 30s"} +{"loss": 0.81772461, "grad_norm": 0.08272332, "learning_rate": 8.486e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044712, "rewards/chosen": 9.75, "rewards/rejected": 0.81640625, "rewards/accuracies": 1.0, "rewards/margins": 8.875, "logps/rejected": -524.0, "logps/chosen": -572.0, "logits/rejected": -0.12597656, "logits/chosen": -0.46289062, "nll_loss": 0.82421875, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "12m 58s", "remaining_time": "31m 30s"} +{"loss": 0.76120605, "grad_norm": 0.06964003, "learning_rate": 7.961e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045037, "rewards/chosen": 9.8125, 
"rewards/rejected": 1.4921875, "rewards/accuracies": 1.0, "rewards/margins": 8.3125, "logps/rejected": -656.0, "logps/chosen": -564.0, "logits/rejected": 0.04418945, "logits/chosen": -0.15234375, "nll_loss": 0.8046875, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 44s", "remaining_time": "29m 28s"} +{"eval_loss": 0.65625, "eval_runtime": 2.5963, "eval_samples_per_second": 1.541, "eval_steps_per_second": 0.77, "eval_rewards/chosen": 9.25, "eval_rewards/rejected": 3.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.0625, "eval_logps/rejected": -362.0, "eval_logps/chosen": -1200.0, "eval_logits/rejected": -1.0390625, "eval_logits/chosen": -0.4296875, "eval_nll_loss": 0.82421875, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 46s", "remaining_time": "29m 33s"} +{"loss": 0.7937561, "grad_norm": 0.07254373, "learning_rate": 7.38e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045165, "rewards/chosen": 10.0, "rewards/rejected": 2.5625, "rewards/accuracies": 1.0, "rewards/margins": 7.4375, "logps/rejected": -684.0, "logps/chosen": -430.0, "logits/rejected": 0.1015625, "logits/chosen": -0.47460938, "nll_loss": 0.828125, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "16m 32s", "remaining_time": "27m 33s"} +{"loss": 0.82971802, "grad_norm": 0.09461146, "learning_rate": 6.753e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044966, "rewards/chosen": 10.875, "rewards/rejected": 2.40625, "rewards/accuracies": 1.0, "rewards/margins": 8.5, "logps/rejected": -572.0, "logps/chosen": -442.0, "logits/rejected": 0.14941406, "logits/chosen": -0.26367188, "nll_loss": 0.7421875, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "18m 27s", "remaining_time": "25m 50s"} +{"loss": 0.75626221, "grad_norm": 0.04766707, "learning_rate": 6.093e-05, "memory(GiB)": 25.7, 
"train_speed(iter/s)": 0.045216, "rewards/chosen": 11.3125, "rewards/rejected": 2.65625, "rewards/accuracies": 1.0, "rewards/margins": 8.6875, "logps/rejected": -496.0, "logps/chosen": -464.0, "logits/rejected": -0.01220703, "logits/chosen": -0.05834961, "nll_loss": 0.75390625, "epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "20m 12s", "remaining_time": "23m 52s"} +{"loss": 0.78671265, "grad_norm": 0.06131606, "learning_rate": 5.413e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.0454, "rewards/chosen": 11.875, "rewards/rejected": 2.953125, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -504.0, "logps/chosen": -500.0, "logits/rejected": 0.05249023, "logits/chosen": -0.125, "nll_loss": 0.72265625, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "21m 57s", "remaining_time": "21m 57s"} +{"eval_loss": 0.63476562, "eval_runtime": 2.5509, "eval_samples_per_second": 1.568, "eval_steps_per_second": 0.784, "eval_rewards/chosen": 11.625, "eval_rewards/rejected": 4.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.28125, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0, "eval_logits/chosen": -0.30859375, "eval_nll_loss": 0.81640625, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "21m 59s", "remaining_time": "21m 59s"} +{"loss": 0.7961792, "grad_norm": 0.06676937, "learning_rate": 4.725e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045398, "rewards/chosen": 11.3125, "rewards/rejected": 3.234375, "rewards/accuracies": 1.0, "rewards/margins": 8.0625, "logps/rejected": -676.0, "logps/chosen": -560.0, "logits/rejected": -0.02514648, "logits/chosen": -0.38671875, "nll_loss": 0.7578125, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "23m 47s", "remaining_time": "20m 7s"} +{"loss": 0.68297119, "grad_norm": 
0.05652873, "learning_rate": 4.041e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.04555, "rewards/chosen": 11.5, "rewards/rejected": 3.09375, "rewards/accuracies": 1.0, "rewards/margins": 8.375, "logps/rejected": -644.0, "logps/chosen": -360.0, "logits/rejected": 0.30664062, "logits/chosen": -0.36328125, "nll_loss": 0.66015625, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "25m 32s", "remaining_time": "18m 14s"} +{"loss": 0.84975586, "grad_norm": 0.06536273, "learning_rate": 3.377e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045382, "rewards/chosen": 11.9375, "rewards/rejected": 3.015625, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -556.0, "logps/chosen": -536.0, "logits/rejected": 0.01196289, "logits/chosen": -0.14160156, "nll_loss": 0.703125, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "27m 28s", "remaining_time": "16m 29s"} +{"loss": 0.70629272, "grad_norm": 0.04825974, "learning_rate": 2.742e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045536, "rewards/chosen": 12.0625, "rewards/rejected": 3.15625, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -458.0, "logps/chosen": -502.0, "logits/rejected": -0.04370117, "logits/chosen": -0.10644531, "nll_loss": 0.69921875, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "29m 12s", "remaining_time": "14m 36s"} +{"eval_loss": 0.62597656, "eval_runtime": 2.4567, "eval_samples_per_second": 1.628, "eval_steps_per_second": 0.814, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.875, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -0.9765625, "eval_logits/chosen": -0.2421875, "eval_nll_loss": 0.80859375, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "29m 15s", 
"remaining_time": "14m 37s"} +{"loss": 0.74567871, "grad_norm": 0.05638585, "learning_rate": 2.151e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045586, "rewards/chosen": 12.6875, "rewards/rejected": 2.84375, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -616.0, "logps/chosen": -604.0, "logits/rejected": 0.15625, "logits/chosen": -0.23925781, "nll_loss": 0.8046875, "epoch": 3.52525253, "global_step/max_steps": "85/120", "percentage": "70.83%", "elapsed_time": "31m 0s", "remaining_time": "12m 46s"} +{"loss": 0.67172852, "grad_norm": 0.0552444, "learning_rate": 1.614e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045721, "rewards/chosen": 11.75, "rewards/rejected": 2.859375, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -588.0, "logps/chosen": -432.0, "logits/rejected": 0.1328125, "logits/chosen": -0.29492188, "nll_loss": 0.6484375, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "32m 44s", "remaining_time": "10m 54s"} +{"loss": 0.73499146, "grad_norm": 0.05977092, "learning_rate": 1.14e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045829, "rewards/chosen": 11.5625, "rewards/rejected": 2.5625, "rewards/accuracies": 1.0, "rewards/margins": 9.0, "logps/rejected": -648.0, "logps/chosen": -366.0, "logits/rejected": 0.18554688, "logits/chosen": -0.36132812, "nll_loss": 0.69921875, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "34m 28s", "remaining_time": "9m 4s"} +{"loss": 0.81884155, "grad_norm": 0.06596772, "learning_rate": 7.4e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045708, "rewards/chosen": 11.6875, "rewards/rejected": 3.234375, "rewards/accuracies": 1.0, "rewards/margins": 8.4375, "logps/rejected": -620.0, "logps/chosen": -366.0, "logits/rejected": 0.25195312, "logits/chosen": -0.42773438, "nll_loss": 0.65234375, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", 
"elapsed_time": "36m 23s", "remaining_time": "7m 16s"} +{"eval_loss": 0.62207031, "eval_runtime": 2.2433, "eval_samples_per_second": 1.783, "eval_steps_per_second": 0.892, "eval_rewards/chosen": 13.25, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.8125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1160.0, "eval_logits/rejected": -0.9609375, "eval_logits/chosen": -0.21875, "eval_nll_loss": 0.8046875, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "36m 25s", "remaining_time": "7m 17s"} +{"loss": 0.71077881, "grad_norm": 0.06608693, "learning_rate": 4.21e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045758, "rewards/chosen": 12.3125, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 9.1875, "logps/rejected": -552.0, "logps/chosen": -420.0, "logits/rejected": 0.10058594, "logits/chosen": -0.27148438, "nll_loss": 0.671875, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "38m 10s", "remaining_time": "5m 27s"} +{"loss": 0.80072632, "grad_norm": 0.05632289, "learning_rate": 1.89e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045887, "rewards/chosen": 13.0, "rewards/rejected": 3.3125, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -868.0, "logps/chosen": -708.0, "logits/rejected": 0.1875, "logits/chosen": -0.19628906, "nll_loss": 0.859375, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "39m 53s", "remaining_time": "3m 37s"} +{"loss": 0.80336914, "grad_norm": 0.1187044, "learning_rate": 4.7e-07, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045978, "rewards/chosen": 12.5, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -592.0, "logps/chosen": -584.0, "logits/rejected": -0.04638672, "logits/chosen": -0.15722656, "nll_loss": 0.84375, "epoch": 4.76767677, 
"global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "41m 37s", "remaining_time": "1m 48s"} +{"loss": 0.6496582, "grad_norm": 0.07102631, "learning_rate": 0.0, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.046069, "rewards/chosen": 13.375, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -716.0, "logps/chosen": -568.0, "logits/rejected": 0.1484375, "logits/chosen": -0.07666016, "nll_loss": 0.69921875, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 20s", "remaining_time": "0s"} +{"eval_loss": 0.62011719, "eval_runtime": 2.3336, "eval_samples_per_second": 1.714, "eval_steps_per_second": 0.857, "eval_rewards/chosen": 13.25, "eval_rewards/rejected": 4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.625, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1160.0, "eval_logits/rejected": -0.953125, "eval_logits/chosen": -0.20703125, "eval_nll_loss": 0.80078125, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 23s", "remaining_time": "0s"} +{"train_runtime": 2603.7827, "train_samples_per_second": 0.76, "train_steps_per_second": 0.046, "total_flos": 11267474751488.0, "train_loss": 0.93597488, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 23s", "remaining_time": "0s"} +{"train_dataset": "1184.691919±553.980140, min=300.000000, max=6113.000000, size=396", "val_dataset": "1183.750000±508.140421, min=717.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1786.3204M Params (9.2324M Trainable [0.5168%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120", "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/checkpoint-120", "best_metric": 0.62011719, "global_step": 120, "log_history": [{"loss": 1.94580078125, "grad_norm": 1.24702048103415, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 25.69, "train_speed(iter/s)": 0.027363, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.140625, "logits/chosen": -0.51953125, "nll_loss": 1.5703125, "epoch": 0.04040404040404041, "step": 1}, {"loss": 2.4593505859375, "grad_norm": 1.5074727724564423, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.039476, "rewards/chosen": 0.0234375, "rewards/rejected": 0.06591796875, "rewards/accuracies": 0.1875, "rewards/margins": -0.042236328125, "logps/rejected": -708.0, "logps/chosen": -712.0, "logits/rejected": 0.033935546875, "logits/chosen": -0.458984375, "nll_loss": 1.515625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.776171875, "grad_norm": 0.9228133591717713, "learning_rate": 9.969653386589748e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.042464, "rewards/chosen": 0.66796875, "rewards/rejected": 0.03369140625, "rewards/accuracies": 0.75, "rewards/margins": 0.63671875, "logps/rejected": -600.0, "logps/chosen": -636.0, "logits/rejected": -0.034912109375, "logits/chosen": -0.185546875, "nll_loss": 1.1171875, "epoch": 0.40404040404040403, "step": 10}, {"loss": 1.723095703125, "grad_norm": 0.7395575770882112, "learning_rate": 9.847001329696653e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.043941, "rewards/chosen": 2.609375, "rewards/rejected": 0.408203125, "rewards/accuracies": 0.949999988079071, "rewards/margins": 2.203125, "logps/rejected": -668.0, "logps/chosen": -652.0, "logits/rejected": -0.07568359375, "logits/chosen": -0.51171875, "nll_loss": 1.4140625, "epoch": 0.6060606060606061, 
"step": 15}, {"loss": 1.1149658203125, "grad_norm": 0.26101957851244034, "learning_rate": 9.632470336074009e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.04473, "rewards/chosen": 5.6875, "rewards/rejected": 2.359375, "rewards/accuracies": 1.0, "rewards/margins": 3.328125, "logps/rejected": -596.0, "logps/chosen": -418.0, "logits/rejected": -0.06982421875, "logits/chosen": -0.72265625, "nll_loss": 1.078125, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.78369140625, "eval_runtime": 2.4718, "eval_samples_per_second": 1.618, "eval_steps_per_second": 0.809, "eval_rewards/chosen": 6.46875, "eval_rewards/rejected": 3.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.40625, "eval_logps/rejected": -362.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.0234375, "eval_logits/chosen": -0.453125, "eval_nll_loss": 0.9921875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.9431640625, "grad_norm": 0.14339630587567886, "learning_rate": 9.330127018922194e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044105, "rewards/chosen": 7.65625, "rewards/rejected": 2.15625, "rewards/accuracies": 0.9545454382896423, "rewards/margins": 5.5, "logps/rejected": -600.0, "logps/chosen": -552.0, "logits/rejected": 0.06689453125, "logits/chosen": -0.296875, "nll_loss": 0.87890625, "epoch": 1.0404040404040404, "step": 25}, {"loss": 0.8330078125, "grad_norm": 0.10599759576627858, "learning_rate": 8.945702546981969e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044482, "rewards/chosen": 8.875, "rewards/rejected": 1.03125, "rewards/accuracies": 1.0, "rewards/margins": 7.8125, "logps/rejected": -632.0, "logps/chosen": -544.0, "logits/rejected": -0.119140625, "logits/chosen": -0.390625, "nll_loss": 0.87890625, "epoch": 1.2424242424242424, "step": 30}, {"loss": 0.817724609375, "grad_norm": 0.08272331943435543, "learning_rate": 8.486484005469977e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044712, "rewards/chosen": 9.75, "rewards/rejected": 
0.81640625, "rewards/accuracies": 1.0, "rewards/margins": 8.875, "logps/rejected": -524.0, "logps/chosen": -572.0, "logits/rejected": -0.1259765625, "logits/chosen": -0.462890625, "nll_loss": 0.82421875, "epoch": 1.4444444444444444, "step": 35}, {"loss": 0.7612060546875, "grad_norm": 0.0696400325437939, "learning_rate": 7.961176263324901e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045037, "rewards/chosen": 9.8125, "rewards/rejected": 1.4921875, "rewards/accuracies": 1.0, "rewards/margins": 8.3125, "logps/rejected": -656.0, "logps/chosen": -564.0, "logits/rejected": 0.044189453125, "logits/chosen": -0.15234375, "nll_loss": 0.8046875, "epoch": 1.6464646464646466, "step": 40}, {"eval_loss": 0.65625, "eval_runtime": 2.5963, "eval_samples_per_second": 1.541, "eval_steps_per_second": 0.77, "eval_rewards/chosen": 9.25, "eval_rewards/rejected": 3.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.0625, "eval_logps/rejected": -362.0, "eval_logps/chosen": -1200.0, "eval_logits/rejected": -1.0390625, "eval_logits/chosen": -0.4296875, "eval_nll_loss": 0.82421875, "epoch": 1.6464646464646466, "step": 40}, {"loss": 0.793756103515625, "grad_norm": 0.07254373381418182, "learning_rate": 7.379736965185368e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045165, "rewards/chosen": 10.0, "rewards/rejected": 2.5625, "rewards/accuracies": 1.0, "rewards/margins": 7.4375, "logps/rejected": -684.0, "logps/chosen": -430.0, "logits/rejected": 0.1015625, "logits/chosen": -0.474609375, "nll_loss": 0.828125, "epoch": 1.8484848484848486, "step": 45}, {"loss": 0.829718017578125, "grad_norm": 0.09461146468300914, "learning_rate": 6.753187775963773e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.044966, "rewards/chosen": 10.875, "rewards/rejected": 2.40625, "rewards/accuracies": 1.0, "rewards/margins": 8.5, "logps/rejected": -572.0, "logps/chosen": -442.0, "logits/rejected": 0.1494140625, "logits/chosen": -0.263671875, "nll_loss": 0.7421875, "epoch": 2.080808080808081, 
"step": 50}, {"loss": 0.75626220703125, "grad_norm": 0.04766707435910784, "learning_rate": 6.09340545603188e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045216, "rewards/chosen": 11.3125, "rewards/rejected": 2.65625, "rewards/accuracies": 1.0, "rewards/margins": 8.6875, "logps/rejected": -496.0, "logps/chosen": -464.0, "logits/rejected": -0.01220703125, "logits/chosen": -0.058349609375, "nll_loss": 0.75390625, "epoch": 2.282828282828283, "step": 55}, {"loss": 0.786712646484375, "grad_norm": 0.06131605999351018, "learning_rate": 5.4128967273616625e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.0454, "rewards/chosen": 11.875, "rewards/rejected": 2.953125, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -504.0, "logps/chosen": -500.0, "logits/rejected": 0.052490234375, "logits/chosen": -0.125, "nll_loss": 0.72265625, "epoch": 2.484848484848485, "step": 60}, {"eval_loss": 0.634765625, "eval_runtime": 2.5509, "eval_samples_per_second": 1.568, "eval_steps_per_second": 0.784, "eval_rewards/chosen": 11.625, "eval_rewards/rejected": 4.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.28125, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0, "eval_logits/chosen": -0.30859375, "eval_nll_loss": 0.81640625, "epoch": 2.484848484848485, "step": 60}, {"loss": 0.79617919921875, "grad_norm": 0.06676936632421913, "learning_rate": 4.7245611982206724e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045398, "rewards/chosen": 11.3125, "rewards/rejected": 3.234375, "rewards/accuracies": 1.0, "rewards/margins": 8.0625, "logps/rejected": -676.0, "logps/chosen": -560.0, "logits/rejected": -0.025146484375, "logits/chosen": -0.38671875, "nll_loss": 0.7578125, "epoch": 2.686868686868687, "step": 65}, {"loss": 0.68297119140625, "grad_norm": 0.05652873146800221, "learning_rate": 4.0414468403813095e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.04555, "rewards/chosen": 11.5, "rewards/rejected": 3.09375, 
"rewards/accuracies": 1.0, "rewards/margins": 8.375, "logps/rejected": -644.0, "logps/chosen": -360.0, "logits/rejected": 0.306640625, "logits/chosen": -0.36328125, "nll_loss": 0.66015625, "epoch": 2.888888888888889, "step": 70}, {"loss": 0.849755859375, "grad_norm": 0.06536273458529179, "learning_rate": 3.3765026539765834e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045382, "rewards/chosen": 11.9375, "rewards/rejected": 3.015625, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -556.0, "logps/chosen": -536.0, "logits/rejected": 0.011962890625, "logits/chosen": -0.1416015625, "nll_loss": 0.703125, "epoch": 3.121212121212121, "step": 75}, {"loss": 0.706292724609375, "grad_norm": 0.04825974409413595, "learning_rate": 2.7423332084455544e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045536, "rewards/chosen": 12.0625, "rewards/rejected": 3.15625, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -458.0, "logps/chosen": -502.0, "logits/rejected": -0.043701171875, "logits/chosen": -0.1064453125, "nll_loss": 0.69921875, "epoch": 3.323232323232323, "step": 80}, {"eval_loss": 0.6259765625, "eval_runtime": 2.4567, "eval_samples_per_second": 1.628, "eval_steps_per_second": 0.814, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.875, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -0.9765625, "eval_logits/chosen": -0.2421875, "eval_nll_loss": 0.80859375, "epoch": 3.323232323232323, "step": 80}, {"loss": 0.7456787109375, "grad_norm": 0.056385847668733294, "learning_rate": 2.150959712448669e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045586, "rewards/chosen": 12.6875, "rewards/rejected": 2.84375, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -616.0, "logps/chosen": -604.0, "logits/rejected": 0.15625, "logits/chosen": -0.2392578125, "nll_loss": 0.8046875, "epoch": 3.525252525252525, 
"step": 85}, {"loss": 0.671728515625, "grad_norm": 0.05524440051382823, "learning_rate": 1.6135921418712956e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045721, "rewards/chosen": 11.75, "rewards/rejected": 2.859375, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -588.0, "logps/chosen": -432.0, "logits/rejected": 0.1328125, "logits/chosen": -0.294921875, "nll_loss": 0.6484375, "epoch": 3.7272727272727275, "step": 90}, {"loss": 0.734991455078125, "grad_norm": 0.05977092210820939, "learning_rate": 1.1404167454183957e-05, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045829, "rewards/chosen": 11.5625, "rewards/rejected": 2.5625, "rewards/accuracies": 1.0, "rewards/margins": 9.0, "logps/rejected": -648.0, "logps/chosen": -366.0, "logits/rejected": 0.185546875, "logits/chosen": -0.361328125, "nll_loss": 0.69921875, "epoch": 3.929292929292929, "step": 95}, {"loss": 0.818841552734375, "grad_norm": 0.06596771804969032, "learning_rate": 7.404029558083653e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045708, "rewards/chosen": 11.6875, "rewards/rejected": 3.234375, "rewards/accuracies": 1.0, "rewards/margins": 8.4375, "logps/rejected": -620.0, "logps/chosen": -366.0, "logits/rejected": 0.251953125, "logits/chosen": -0.427734375, "nll_loss": 0.65234375, "epoch": 4.161616161616162, "step": 100}, {"eval_loss": 0.6220703125, "eval_runtime": 2.2433, "eval_samples_per_second": 1.783, "eval_steps_per_second": 0.892, "eval_rewards/chosen": 13.25, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.8125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1160.0, "eval_logits/rejected": -0.9609375, "eval_logits/chosen": -0.21875, "eval_nll_loss": 0.8046875, "epoch": 4.161616161616162, "step": 100}, {"loss": 0.71077880859375, "grad_norm": 0.06608692576727743, "learning_rate": 4.2113336672471245e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045758, "rewards/chosen": 12.3125, "rewards/rejected": 3.171875, 
"rewards/accuracies": 1.0, "rewards/margins": 9.1875, "logps/rejected": -552.0, "logps/chosen": -420.0, "logits/rejected": 0.1005859375, "logits/chosen": -0.271484375, "nll_loss": 0.671875, "epoch": 4.363636363636363, "step": 105}, {"loss": 0.800726318359375, "grad_norm": 0.05632288765049835, "learning_rate": 1.8865999845374793e-06, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045887, "rewards/chosen": 13.0, "rewards/rejected": 3.3125, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -868.0, "logps/chosen": -708.0, "logits/rejected": 0.1875, "logits/chosen": -0.1962890625, "nll_loss": 0.859375, "epoch": 4.565656565656566, "step": 110}, {"loss": 0.803369140625, "grad_norm": 0.11870440122922361, "learning_rate": 4.738957681248379e-07, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.045978, "rewards/chosen": 12.5, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -592.0, "logps/chosen": -584.0, "logits/rejected": -0.04638671875, "logits/chosen": -0.1572265625, "nll_loss": 0.84375, "epoch": 4.767676767676767, "step": 115}, {"loss": 0.649658203125, "grad_norm": 0.07102631187789318, "learning_rate": 0.0, "memory(GiB)": 25.7, "train_speed(iter/s)": 0.046069, "rewards/chosen": 13.375, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -716.0, "logps/chosen": -568.0, "logits/rejected": 0.1484375, "logits/chosen": -0.07666015625, "nll_loss": 0.69921875, "epoch": 4.96969696969697, "step": 120}, {"eval_loss": 0.6201171875, "eval_runtime": 2.3336, "eval_samples_per_second": 1.714, "eval_steps_per_second": 0.857, "eval_rewards/chosen": 13.25, "eval_rewards/rejected": 4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.625, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1160.0, "eval_logits/rejected": -0.953125, "eval_logits/chosen": -0.20703125, "eval_nll_loss": 0.80078125, "epoch": 4.96969696969697, "step": 120}, {"train_runtime": 
2603.7827, "train_samples_per_second": 0.76, "train_steps_per_second": 0.046, "total_flos": 11267474751488.0, "train_loss": 0.9359748840332032, "epoch": 4.96969696969697, "step": 120}], "memory": 25.6953125} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs/events.out.tfevents.1737743914.kml-dtmachine-18088-prod.31276.0 b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs/events.out.tfevents.1737743914.kml-dtmachine-18088-prod.31276.0 new file mode 100644 index 0000000000000000000000000000000000000000..e8ade054183e7bc44e5ad3b0b6be1e18774f61ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-183757/runs/events.out.tfevents.1737743914.kml-dtmachine-18088-prod.31276.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a46769cc77383bc204a75582746e4a952b0e147f4c468c825506b0b7a49f52 +size 33774 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/args.json new file mode 100644 index 0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + 
"sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + 
"max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + 
"pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + 
"batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + 
"lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, 
logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 
'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, 
precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + 
+[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..419d69de66ea1ec23c49d73924c1dea291f94131 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d611e889956ec6b5e70952d3bf31141dfd37cf447b9b1f10badfebaaf325b1a7 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e627a3f82dfaa978767c0ca2f6b3b999c6dd9838 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b4782ef12817d6b6b63841d5f3d973c139e84c1924aa6eea3b057b81bcee0d0 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5c06b9189681b0cbd41ce34da60c5322827ebed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6241c209528a8307630f782f441547db7b122bd9b678549bfa564db30fe7895 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4630ab910148ec496a9371c15027fce0c8148456 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3f4e8e1a1b6f61ec61b812ff2ba61b0a7e5785cd590c83b6c86e91800cac88 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85178f05a86ad889883386505dce2b07272a874 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b88a9dee27b13485971f65c54fd1abf7f03ebc80932a59b02e1688ef0cce8f +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef9c04c3f96c1e0e71cd212fb4ff6ed37dbda449 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.63623047, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, 
+ "logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.542498800867474, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.1005859375, + "logits/rejected": 0.1279296875, + "logps/chosen": -552.0, + "logps/rejected": -568.0, + "loss": 1.2177978515625, + "memory(GiB)": 25.88, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.25, + "rewards/margins": 2.15625, + "rewards/rejected": 5.09375, + "step": 25, + "train_speed(iter/s)": 0.043339 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.265838833128634, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.0966796875, + "logits/rejected": -0.09716796875, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 
0.92021484375, + "memory(GiB)": 25.88, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5, + "rewards/rejected": 4.375, + "step": 30, + "train_speed(iter/s)": 0.043908 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1506666851461901, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.08984375, + "logits/rejected": -0.08984375, + "logps/chosen": -576.0, + "logps/rejected": -484.0, + "loss": 0.8673828125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 4.9375, + "rewards/rejected": 4.71875, + "step": 35, + "train_speed(iter/s)": 0.044234 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.07923738211951831, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.3203125, + "logits/rejected": 0.07275390625, + "logps/chosen": -564.0, + "logps/rejected": -632.0, + "loss": 0.78349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.8203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.90625, + "rewards/rejected": 4.09375, + "step": 40, + "train_speed(iter/s)": 0.044579 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.1484375, + "eval_logits/rejected": -1.0859375, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.75, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.6003, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 0.769, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.09411209765685535, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.05322265625, + "logits/rejected": 0.1015625, + "logps/chosen": -428.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 25.88, + "nll_loss": 0.8359375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 6.0, + "rewards/rejected": 4.1875, + "step": 45, + "train_speed(iter/s)": 0.044378 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12369237367680226, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09228515625, + "logits/rejected": 0.1044921875, + "logps/chosen": -446.0, + "logps/rejected": -560.0, + "loss": 0.85614013671875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 6.8125, + "rewards/rejected": 3.609375, + "step": 50, + "train_speed(iter/s)": 0.044098 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.0681210790824193, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1689453125, + "logits/rejected": -0.07568359375, + "logps/chosen": -468.0, + "logps/rejected": -482.0, + "loss": 0.7699462890625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.9375, + "rewards/margins": 7.0, + "rewards/rejected": 3.921875, + "step": 55, + "train_speed(iter/s)": 0.04427 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.09556035300668257, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.0023040771484375, + "logps/chosen": -502.0, + "logps/rejected": -494.0, + "loss": 0.7938720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.53125, + "rewards/rejected": 3.875, + "step": 60, + "train_speed(iter/s)": 0.044449 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0625, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.654296875, + "eval_nll_loss": 0.83984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 4.34375, + "eval_runtime": 
2.5187, + "eval_samples_per_second": 1.588, + "eval_steps_per_second": 0.794, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0731266213113978, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01507568359375, + "logits/rejected": -0.07421875, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.803509521484375, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 6.84375, + "rewards/rejected": 4.59375, + "step": 65, + "train_speed(iter/s)": 0.04445 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0688101140360203, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.076171875, + "logits/rejected": 0.318359375, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.69508056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 7.09375, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.044511 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07652393042135892, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09716796875, + "logits/rejected": -0.045166015625, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.8555908203125, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.6875, + "rewards/rejected": 4.15625, + "step": 75, + "train_speed(iter/s)": 0.044231 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.055938241299950876, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.2109375, + "logits/rejected": -0.1103515625, + "logps/chosen": -502.0, + "logps/rejected": -444.0, + "loss": 0.715106201171875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.5625, + "rewards/rejected": 4.28125, + "step": 80, + 
"train_speed(iter/s)": 0.044286 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.271484375, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.6416015625, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.3125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5165, + "eval_samples_per_second": 1.59, + "eval_steps_per_second": 0.795, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.06568256120217658, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": 0.1865234375, + "logits/rejected": 0.09423828125, + "logps/chosen": -608.0, + "logps/rejected": -608.0, + "loss": 0.7561279296875, + "memory(GiB)": 25.88, + "nll_loss": 0.8125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.125, + "rewards/margins": 7.96875, + "rewards/rejected": 4.15625, + "step": 85, + "train_speed(iter/s)": 0.044291 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.06612494124440302, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": 0.09375, + "logits/rejected": 0.09033203125, + "logps/chosen": -434.0, + "logps/rejected": -576.0, + "loss": 0.67840576171875, + "memory(GiB)": 25.88, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.625, + "rewards/margins": 7.78125, + "rewards/rejected": 3.859375, + "step": 90, + "train_speed(iter/s)": 0.043843 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.07071323350474046, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": 0.052490234375, + "logits/rejected": 0.1435546875, + "logps/chosen": -368.0, + "logps/rejected": -632.0, + "loss": 0.749688720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 7.625, + "rewards/rejected": 3.59375, + "step": 95, + "train_speed(iter/s)": 0.043933 + }, + { + "epoch": 4.161616161616162, + 
"grad_norm": 0.06933003179796686, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -0.007476806640625, + "logits/rejected": 0.21875, + "logps/chosen": -364.0, + "logps/rejected": -608.0, + "loss": 0.828558349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.625, + "rewards/margins": 7.53125, + "rewards/rejected": 4.09375, + "step": 100, + "train_speed(iter/s)": 0.043751 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": 0.287109375, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.63623046875, + "eval_nll_loss": 0.8125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.125, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 2.6531, + "eval_samples_per_second": 1.508, + "eval_steps_per_second": 0.754, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9336058478592.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa2fad7f2224f80d7e1e8a3d488e19d13e8e37f5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f182fefc43ebd6b429c6ba2f55b2a0ef8ec9c49cd116961aabe58d5a3bd7e775 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8516f857ba7e27ed9927d3646758c14c81adf47f --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d423472e5fd48b4e8baa237b44037d528cc87755a68fe4947ed43ecd4e9789 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cd1ed984ae24d1389da49e9b59e7af4e66c809b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6003753004e3a0d4f75bcdb0786b6585909999c15c5cc630d1b0d66da361c763 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd73fc484029ac8e61a42ce1da4c0c01c8a6e8e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f50fc238799cb875ba398f5645372c169d3d5a781b4b52e2b6d28c15ca91043 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea35f6170ea4973cf2b61b37628d99dd422f06f5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b692f2e44a761349843c093c6e0f9a39386a746f0716c83b76c53131568bd73 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..1ff406405418d84068458850f74aecfc6224f793 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/latest @@ -0,0 +1 @@ +global_step122 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a49f44ba05d98a84fd55c18c4fa41c6437c8853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..79ef7e8924723bd699efa313eb78103d80b7edb9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40007a79aad967206b797079ca5147beff46ee1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede1043a0735266b510faa06f578fa6ef180c11e994a142a88a13ac6f33eb78b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fadd30f7b8e1928a743440dc79140f33b99b8d62 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.63623047, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100", + "epoch": 4.96969696969697, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, 
+ "logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.542498800867474, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.1005859375, + "logits/rejected": 0.1279296875, + "logps/chosen": -552.0, + "logps/rejected": -568.0, + "loss": 1.2177978515625, + "memory(GiB)": 25.88, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.25, + "rewards/margins": 2.15625, + "rewards/rejected": 5.09375, + "step": 25, + "train_speed(iter/s)": 0.043339 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.265838833128634, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.0966796875, + "logits/rejected": -0.09716796875, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 
0.92021484375, + "memory(GiB)": 25.88, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5, + "rewards/rejected": 4.375, + "step": 30, + "train_speed(iter/s)": 0.043908 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1506666851461901, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.08984375, + "logits/rejected": -0.08984375, + "logps/chosen": -576.0, + "logps/rejected": -484.0, + "loss": 0.8673828125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 4.9375, + "rewards/rejected": 4.71875, + "step": 35, + "train_speed(iter/s)": 0.044234 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.07923738211951831, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.3203125, + "logits/rejected": 0.07275390625, + "logps/chosen": -564.0, + "logps/rejected": -632.0, + "loss": 0.78349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.8203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.90625, + "rewards/rejected": 4.09375, + "step": 40, + "train_speed(iter/s)": 0.044579 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.1484375, + "eval_logits/rejected": -1.0859375, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.75, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.6003, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 0.769, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.09411209765685535, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.05322265625, + "logits/rejected": 0.1015625, + "logps/chosen": -428.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 25.88, + "nll_loss": 0.8359375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 6.0, + "rewards/rejected": 4.1875, + "step": 45, + "train_speed(iter/s)": 0.044378 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12369237367680226, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09228515625, + "logits/rejected": 0.1044921875, + "logps/chosen": -446.0, + "logps/rejected": -560.0, + "loss": 0.85614013671875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 6.8125, + "rewards/rejected": 3.609375, + "step": 50, + "train_speed(iter/s)": 0.044098 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.0681210790824193, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1689453125, + "logits/rejected": -0.07568359375, + "logps/chosen": -468.0, + "logps/rejected": -482.0, + "loss": 0.7699462890625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.9375, + "rewards/margins": 7.0, + "rewards/rejected": 3.921875, + "step": 55, + "train_speed(iter/s)": 0.04427 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.09556035300668257, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.0023040771484375, + "logps/chosen": -502.0, + "logps/rejected": -494.0, + "loss": 0.7938720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.53125, + "rewards/rejected": 3.875, + "step": 60, + "train_speed(iter/s)": 0.044449 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0625, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.654296875, + "eval_nll_loss": 0.83984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 4.34375, + "eval_runtime": 
2.5187, + "eval_samples_per_second": 1.588, + "eval_steps_per_second": 0.794, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0731266213113978, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01507568359375, + "logits/rejected": -0.07421875, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.803509521484375, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 6.84375, + "rewards/rejected": 4.59375, + "step": 65, + "train_speed(iter/s)": 0.04445 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0688101140360203, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.076171875, + "logits/rejected": 0.318359375, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.69508056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 7.09375, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.044511 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07652393042135892, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09716796875, + "logits/rejected": -0.045166015625, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.8555908203125, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.6875, + "rewards/rejected": 4.15625, + "step": 75, + "train_speed(iter/s)": 0.044231 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.055938241299950876, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.2109375, + "logits/rejected": -0.1103515625, + "logps/chosen": -502.0, + "logps/rejected": -444.0, + "loss": 0.715106201171875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.5625, + "rewards/rejected": 4.28125, + "step": 80, + 
"train_speed(iter/s)": 0.044286 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.271484375, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.6416015625, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.3125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5165, + "eval_samples_per_second": 1.59, + "eval_steps_per_second": 0.795, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.06568256120217658, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": 0.1865234375, + "logits/rejected": 0.09423828125, + "logps/chosen": -608.0, + "logps/rejected": -608.0, + "loss": 0.7561279296875, + "memory(GiB)": 25.88, + "nll_loss": 0.8125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.125, + "rewards/margins": 7.96875, + "rewards/rejected": 4.15625, + "step": 85, + "train_speed(iter/s)": 0.044291 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.06612494124440302, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": 0.09375, + "logits/rejected": 0.09033203125, + "logps/chosen": -434.0, + "logps/rejected": -576.0, + "loss": 0.67840576171875, + "memory(GiB)": 25.88, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.625, + "rewards/margins": 7.78125, + "rewards/rejected": 3.859375, + "step": 90, + "train_speed(iter/s)": 0.043843 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.07071323350474046, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": 0.052490234375, + "logits/rejected": 0.1435546875, + "logps/chosen": -368.0, + "logps/rejected": -632.0, + "loss": 0.749688720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 7.625, + "rewards/rejected": 3.59375, + "step": 95, + "train_speed(iter/s)": 0.043933 + }, + { + "epoch": 4.161616161616162, + 
"grad_norm": 0.06933003179796686, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -0.007476806640625, + "logits/rejected": 0.21875, + "logps/chosen": -364.0, + "logps/rejected": -608.0, + "loss": 0.828558349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.625, + "rewards/margins": 7.53125, + "rewards/rejected": 4.09375, + "step": 100, + "train_speed(iter/s)": 0.043751 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": 0.287109375, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.63623046875, + "eval_nll_loss": 0.8125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.125, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 2.6531, + "eval_samples_per_second": 1.508, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.0811559898134451, + "learning_rate": 4.2113336672471245e-06, + "logits/chosen": 0.061279296875, + "logits/rejected": 0.05810546875, + "logps/chosen": -422.0, + "logps/rejected": -540.0, + "loss": 0.7190185546875, + "memory(GiB)": 25.88, + "nll_loss": 0.6796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 7.78125, + "rewards/rejected": 4.4375, + "step": 105, + "train_speed(iter/s)": 0.043728 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 0.062097460024984424, + "learning_rate": 1.8865999845374793e-06, + "logits/chosen": 0.1357421875, + "logits/rejected": 0.134765625, + "logps/chosen": -712.0, + "logps/rejected": -856.0, + "loss": 0.81259765625, + "memory(GiB)": 25.88, + "nll_loss": 0.875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 8.375, + "rewards/rejected": 4.125, + "step": 110, + "train_speed(iter/s)": 0.043777 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.12783918074585626, + "learning_rate": 4.738957681248379e-07, + 
"logits/chosen": 0.162109375, + "logits/rejected": -0.11572265625, + "logps/chosen": -584.0, + "logps/rejected": -580.0, + "loss": 0.81141357421875, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 8.1875, + "rewards/rejected": 4.03125, + "step": 115, + "train_speed(iter/s)": 0.043804 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.0838890904333176, + "learning_rate": 0.0, + "logits/chosen": 0.21875, + "logits/rejected": 0.1015625, + "logps/chosen": -576.0, + "logps/rejected": -704.0, + "loss": 0.65443115234375, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5625, + "rewards/margins": 8.75, + "rewards/rejected": 3.796875, + "step": 120, + "train_speed(iter/s)": 0.043853 + }, + { + "epoch": 4.96969696969697, + "eval_logits/chosen": 0.28515625, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.63720703125, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.125, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 2.4599, + "eval_samples_per_second": 1.626, + "eval_steps_per_second": 0.813, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 11174281019392.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7db96c067c4c262314407e6dfb469c395e992593 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b93d89498f798adc84f4297fd6bd3c18c6965b6e7bd9d4c2b6bcfe9e4a23eda +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfa26567df05845ad8c4615e776eddbdbf50c706 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2faf16dfefcd19c41c47354019ad039c68c81a8eb727df15cd4738f5ee0d618 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2b07974e1d1a67f6674da064417d0c8b71dbaae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c57d8c9d0dc811eddd9a7e0b065c58041ff3e6db4f8188f097fc422403f3318 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f646310720c3b9f29d5af4447359561c1c90521 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef80b66cb9665fda10e1c287bf309068c067530e8e93159875f043b452f62af6 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba8e1fd68649ba2ea2f1c48fda23861a1de441a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2921198144698255700f0b59dce87792be511022eedd048e89f00c98ab60799d +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e53132e58acd594a4a42e114a9b6416397850b96 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 1.14257812, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, 
+ "logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1917401104384.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/training_args.bin new file mode 100644 
index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..526dc36ce19677e56ddc4f5577f1f7f79dfb7647 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91e776f6dd515e5fbafd1bb4b95c3f6afb4d631f674185a90ca84d5ed60fb2 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14526c425917481b390dc83b293db18169910c8b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c83af3d38a6d4916cd01785931e76e99553ace4ce02d3411b9db792b4d0b0e5 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed6b5662fdc4c3e5269d1ee9a961bde3b04915ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5982496bfbe0b38cd0a72a4a714841c9fed2e98871f25c0ad37895f7d55a2ed3 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0551ccd65404bbe04e0ac3f0dd0c1ca9971210f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881b8e9eae6a85ffb4aa0cab8ea5abb663a77e610afa8a1d75230186dd867ea5 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e83b853fd0419c832eaa083b512f8dbad45738f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aba0361d28a4cab0466de43f19116b16733f73caf7f61f569f06160bf9c3edb +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6f02779dc17e7a8f208ce93da8bd98e296f78af0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.66748047, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, 
+ "logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.542498800867474, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.1005859375, + "logits/rejected": 0.1279296875, + "logps/chosen": -552.0, + "logps/rejected": -568.0, + "loss": 1.2177978515625, + "memory(GiB)": 25.88, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.25, + "rewards/margins": 2.15625, + "rewards/rejected": 5.09375, + "step": 25, + "train_speed(iter/s)": 0.043339 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.265838833128634, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.0966796875, + "logits/rejected": -0.09716796875, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 
0.92021484375, + "memory(GiB)": 25.88, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5, + "rewards/rejected": 4.375, + "step": 30, + "train_speed(iter/s)": 0.043908 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1506666851461901, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.08984375, + "logits/rejected": -0.08984375, + "logps/chosen": -576.0, + "logps/rejected": -484.0, + "loss": 0.8673828125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 4.9375, + "rewards/rejected": 4.71875, + "step": 35, + "train_speed(iter/s)": 0.044234 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.07923738211951831, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.3203125, + "logits/rejected": 0.07275390625, + "logps/chosen": -564.0, + "logps/rejected": -632.0, + "loss": 0.78349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.8203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.90625, + "rewards/rejected": 4.09375, + "step": 40, + "train_speed(iter/s)": 0.044579 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.1484375, + "eval_logits/rejected": -1.0859375, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.75, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.6003, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 0.769, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + 
"attributes": {} + } + }, + "total_flos": 3927512711168.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05cde98a55c5daec1c2f6be1a8eb183ca31c0881 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f785b61667fc206a559d0b2be63230fede2294f4b443efa3e1e2287da2e7c8eb +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86a79aa313300df71bf217dc2980a0de31765d8d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ac1191d8d79882fc783293e120edffba8ebacfbc67e2ed06f120f6bdd81f9d +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef26c81116d0079c1da43b843165b9cb6fb74329 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82efb417cff064f95f90050232e69c0a4c695c72c9d09b45f9535f7af09e106 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d66364434508a5c3748c75b1164d91f99972dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f573f7e68e47d235331723d53e0e2b94852e73ab87d18b19bf12a9a5ef66d00 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e87180594ae30c1652e34c6356ee94e7d56cc37b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353a8c6ad06e6545638a84b0ee3ff96bee9d1d72dbfec9baf13f431cec8c8b14 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f7e3749dc25fdf6ddacaf49263f1afeef88404b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.65429688, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, + 
"logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.542498800867474, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.1005859375, + "logits/rejected": 0.1279296875, + "logps/chosen": -552.0, + "logps/rejected": -568.0, + "loss": 1.2177978515625, + "memory(GiB)": 25.88, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.25, + "rewards/margins": 2.15625, + "rewards/rejected": 5.09375, + "step": 25, + "train_speed(iter/s)": 0.043339 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.265838833128634, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.0966796875, + "logits/rejected": -0.09716796875, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 
0.92021484375, + "memory(GiB)": 25.88, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5, + "rewards/rejected": 4.375, + "step": 30, + "train_speed(iter/s)": 0.043908 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1506666851461901, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.08984375, + "logits/rejected": -0.08984375, + "logps/chosen": -576.0, + "logps/rejected": -484.0, + "loss": 0.8673828125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 4.9375, + "rewards/rejected": 4.71875, + "step": 35, + "train_speed(iter/s)": 0.044234 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.07923738211951831, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.3203125, + "logits/rejected": 0.07275390625, + "logps/chosen": -564.0, + "logps/rejected": -632.0, + "loss": 0.78349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.8203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.90625, + "rewards/rejected": 4.09375, + "step": 40, + "train_speed(iter/s)": 0.044579 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.1484375, + "eval_logits/rejected": -1.0859375, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.75, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.6003, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 0.769, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.09411209765685535, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.05322265625, + "logits/rejected": 0.1015625, + "logps/chosen": -428.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 25.88, + "nll_loss": 0.8359375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 6.0, + "rewards/rejected": 4.1875, + "step": 45, + "train_speed(iter/s)": 0.044378 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12369237367680226, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09228515625, + "logits/rejected": 0.1044921875, + "logps/chosen": -446.0, + "logps/rejected": -560.0, + "loss": 0.85614013671875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 6.8125, + "rewards/rejected": 3.609375, + "step": 50, + "train_speed(iter/s)": 0.044098 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.0681210790824193, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1689453125, + "logits/rejected": -0.07568359375, + "logps/chosen": -468.0, + "logps/rejected": -482.0, + "loss": 0.7699462890625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.9375, + "rewards/margins": 7.0, + "rewards/rejected": 3.921875, + "step": 55, + "train_speed(iter/s)": 0.04427 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.09556035300668257, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.0023040771484375, + "logps/chosen": -502.0, + "logps/rejected": -494.0, + "loss": 0.7938720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.53125, + "rewards/rejected": 3.875, + "step": 60, + "train_speed(iter/s)": 0.044449 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0625, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.654296875, + "eval_nll_loss": 0.83984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 4.34375, + "eval_runtime": 
2.5187, + "eval_samples_per_second": 1.588, + "eval_steps_per_second": 0.794, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5616157884416.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4c0c38ef8ad9ea8e056b6d01d8d5702e44ace6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fb7fb2637df956dae99be180446249960a8454ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a888971df7733b54c1c7cf136e37ab1a85ff7e6ea1fca6adb292be23bb38ae57 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..5264e9db3dfade20e4af290c5d260b7b1e4bd0a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa0bd76f8cdc440b51d10f3bac4997d6776fc7bf --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:015237a4fd464abb48a6fbfa7120e3f7dec24149b07bc3fedcd25cb076ff81cf +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc9f1e83655cf9526696e6b11afcb3aad3bc4a35 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4af63fa7436b20e7df6aa59cb1b9a551131b63b48f0f6a2ae3ecdaff4b6b44 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b11e2383d1fe0d9ae08f25ec5344cdf1324e262 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e1af91751f6f712831b27ccd88e9190db7cd0cb6957244cc0457a9f7f334be +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd5496587aabc679a0a739cef4cba4d2153581f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e0b21b942b6d1b79d1104c54d47ae2678c93499ec417d02cad0283144e3ea0 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f9b297184d93ead21caa21d8ef3e86e8f7174030 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.64160156, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.204306976098353, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.028637 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.4247437031946648, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.083984375, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4766845703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.21875, + "rewards/chosen": 0.03125, + "rewards/margins": -7.62939453125e-06, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.040155 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2234841967992873, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1318359375, + "logits/rejected": 0.01361083984375, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.945703125, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.546875, + "step": 10, + "train_speed(iter/s)": 0.042924 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7354660931188922, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1533203125, + "logits/rejected": -0.021240234375, + "logps/chosen": -652.0, + 
"logps/rejected": -652.0, + "loss": 2.1078125, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 2.609375, + "rewards/margins": 0.357421875, + "rewards/rejected": 2.25, + "step": 15, + "train_speed(iter/s)": 0.043724 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7411526143175573, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.07958984375, + "logits/rejected": 0.0103759765625, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.6548828125, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 5.09375, + "rewards/margins": 0.8828125, + "rewards/rejected": 4.21875, + "step": 20, + "train_speed(iter/s)": 0.044054 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.232421875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.142578125, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.25, + "eval_rewards/rejected": 5.5, + "eval_runtime": 2.5109, + "eval_samples_per_second": 1.593, + "eval_steps_per_second": 0.797, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.542498800867474, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.1005859375, + "logits/rejected": 0.1279296875, + "logps/chosen": -552.0, + "logps/rejected": -568.0, + "loss": 1.2177978515625, + "memory(GiB)": 25.88, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.25, + "rewards/margins": 2.15625, + "rewards/rejected": 5.09375, + "step": 25, + "train_speed(iter/s)": 0.043339 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.265838833128634, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.0966796875, + "logits/rejected": -0.09716796875, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 
0.92021484375, + "memory(GiB)": 25.88, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5, + "rewards/rejected": 4.375, + "step": 30, + "train_speed(iter/s)": 0.043908 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1506666851461901, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.08984375, + "logits/rejected": -0.08984375, + "logps/chosen": -576.0, + "logps/rejected": -484.0, + "loss": 0.8673828125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 4.9375, + "rewards/rejected": 4.71875, + "step": 35, + "train_speed(iter/s)": 0.044234 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.07923738211951831, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.3203125, + "logits/rejected": 0.07275390625, + "logps/chosen": -564.0, + "logps/rejected": -632.0, + "loss": 0.78349609375, + "memory(GiB)": 25.88, + "nll_loss": 0.8203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.90625, + "rewards/rejected": 4.09375, + "step": 40, + "train_speed(iter/s)": 0.044579 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.1484375, + "eval_logits/rejected": -1.0859375, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.75, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 2.6003, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 0.769, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.09411209765685535, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.05322265625, + "logits/rejected": 0.1015625, + "logps/chosen": -428.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 25.88, + "nll_loss": 0.8359375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 6.0, + "rewards/rejected": 4.1875, + "step": 45, + "train_speed(iter/s)": 0.044378 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12369237367680226, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09228515625, + "logits/rejected": 0.1044921875, + "logps/chosen": -446.0, + "logps/rejected": -560.0, + "loss": 0.85614013671875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 6.8125, + "rewards/rejected": 3.609375, + "step": 50, + "train_speed(iter/s)": 0.044098 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.0681210790824193, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1689453125, + "logits/rejected": -0.07568359375, + "logps/chosen": -468.0, + "logps/rejected": -482.0, + "loss": 0.7699462890625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.9375, + "rewards/margins": 7.0, + "rewards/rejected": 3.921875, + "step": 55, + "train_speed(iter/s)": 0.04427 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.09556035300668257, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.0023040771484375, + "logps/chosen": -502.0, + "logps/rejected": -494.0, + "loss": 0.7938720703125, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.53125, + "rewards/rejected": 3.875, + "step": 60, + "train_speed(iter/s)": 0.044449 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0625, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.654296875, + "eval_nll_loss": 0.83984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 4.34375, + "eval_runtime": 
2.5187, + "eval_samples_per_second": 1.588, + "eval_steps_per_second": 0.794, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0731266213113978, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01507568359375, + "logits/rejected": -0.07421875, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.803509521484375, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 6.84375, + "rewards/rejected": 4.59375, + "step": 65, + "train_speed(iter/s)": 0.04445 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0688101140360203, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.076171875, + "logits/rejected": 0.318359375, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.69508056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 7.09375, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.044511 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07652393042135892, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09716796875, + "logits/rejected": -0.045166015625, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.8555908203125, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.6875, + "rewards/rejected": 4.15625, + "step": 75, + "train_speed(iter/s)": 0.044231 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.055938241299950876, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.2109375, + "logits/rejected": -0.1103515625, + "logps/chosen": -502.0, + "logps/rejected": -444.0, + "loss": 0.715106201171875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 7.5625, + "rewards/rejected": 4.28125, + "step": 80, + 
"train_speed(iter/s)": 0.044286 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.271484375, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -350.0, + "eval_loss": 0.6416015625, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.3125, + "eval_rewards/rejected": 4.3125, + "eval_runtime": 2.5165, + "eval_samples_per_second": 1.59, + "eval_steps_per_second": 0.795, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7429450858496.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3a7e217a66a8a2c44186cd341e5b27b0c840a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728444243ff1c68ff721c9ed45f1f8325dfd02d7c2b2e9c7eed677fb4ca9b355 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..288a2b0d1f5be1c0f8a5f3bf6f1f691ae8fc0adf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..26f6016f7f6851be1e7e978a5ece9702f7fa6258 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2f765d542f8f3758f47847c150eebc2c294d33a7 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3d1b4aa71f2a1828c5925c9c3a3caf3cbfcf9866 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1ab837e823d2a06e2d0098849a955b3dfb0efefc Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..dc6730e48c6f22fa700491e72c3c772887e5c659 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..8cead5875b75d617a13f1247e15b17c39a5168b6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..ccaee2310d50a5ce8d2d3af8bb1eab69e27fe168 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..1d292e611d8382d31cd14bd9590ba2f12ac14ccd Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..4e77b78746bd6ae901d06f708b5fea92b4d932b0 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..0c9be2d051e9ed14b7dfd411e755f1053e1301cd Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..2b680ffb0e5912d0cf27751c922d3d29f2413741 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..6a3f4dfa5f8def0bf8f266f3a3956b575ab0bb60 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..2c72126dce5328d7e5cbf3f3ba730b04185f2ff2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..67583e96d840004a072a14b1b68989b4b448eb5c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..a138c3207804ef5be69dc8ff0d41e1abf82a564e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..7a3595505fea2b4a9d76ccf85974f080ce7c7b72 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..5bf985e69732284505502d14e64d24a8cf121fea Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..85210bcaf1fb1b76d743ccb064e7a48016d93ff8 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..fff7208ae5eae6d8fb30b4b2ab76da74d99336fe Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b186abf1abaf7f3184bdfc2a1ff2b2948d697012 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..a03d47fc57cb60dec7ec863cb703445c8585106a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..86b0b28c07ad5ace66bd18a17f08a3bfad81bcb2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..4d8f9f3920003befbc98b464cb2d8781174bc9a3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..27f067e0b5fca34493a8bd1a5266085ef32d1695 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..9809a18cca390005cd22b9a7299faa09f486cc35 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..dcfe62198d4c866a7b585d8f8c6c2e10bbfedf6b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..ec872f7a167edca016ee95a1b6cee26d2b133194 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..f5446b4c4cd1e2d84338a83997901e8829aba028 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_loss.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..e4723ec07c821659b89c50853d71e02eb60fc9d2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3e672965e2eb78370c12c003692c9b6b4a122e3c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..b376ff1d4cf065d9bc511e559d065d74673342e3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..e10ccb785b5b5f80ae42e3a6908a6d46acdc2191 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/logging.jsonl b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b5acf957a2ce42f939065609303ee9738e47196 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/logging.jsonl @@ -0,0 +1,33 @@ +{"loss": 1.95458984, "grad_norm": 1.20430698, "learning_rate": 1.667e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.028637, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.12695312, "logits/chosen": -0.23828125, "nll_loss": 1.59375, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "31s", "remaining_time": "1h 2m 10s"} +{"loss": 2.47668457, "grad_norm": 1.4247437, "learning_rate": 8.333e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.040155, "rewards/chosen": 0.03125, "rewards/rejected": 0.03125, "rewards/accuracies": 0.21875, "rewards/margins": -7.63e-06, "logps/rejected": -708.0, "logps/chosen": -712.0, "logits/rejected": 0.08398438, "logits/chosen": -0.0859375, "nll_loss": 1.515625, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "2m 0s", "remaining_time": "46m 21s"} +{"loss": 1.94570313, "grad_norm": 1.2234842, "learning_rate": 9.97e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.042924, "rewards/chosen": 0.62890625, "rewards/rejected": 0.546875, "rewards/accuracies": 0.55000001, "rewards/margins": 0.08398438, 
"logps/rejected": -592.0, "logps/chosen": -632.0, "logits/rejected": 0.01361084, "logits/chosen": 0.13183594, "nll_loss": 1.109375, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "3m 49s", "remaining_time": "42m 3s"} +{"loss": 2.1078125, "grad_norm": 0.73546609, "learning_rate": 9.847e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043724, "rewards/chosen": 2.609375, "rewards/rejected": 2.25, "rewards/accuracies": 0.69999999, "rewards/margins": 0.35742188, "logps/rejected": -652.0, "logps/chosen": -652.0, "logits/rejected": -0.02124023, "logits/chosen": -0.15332031, "nll_loss": 1.484375, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "5m 39s", "remaining_time": "39m 36s"} +{"loss": 1.65488281, "grad_norm": 0.74115261, "learning_rate": 9.632e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044054, "rewards/chosen": 5.09375, "rewards/rejected": 4.21875, "rewards/accuracies": 0.85000002, "rewards/margins": 0.8828125, "logps/rejected": -580.0, "logps/chosen": -424.0, "logits/rejected": 0.01037598, "logits/chosen": -0.07958984, "nll_loss": 1.2109375, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 30s", "remaining_time": "37m 32s"} +{"eval_loss": 1.14257812, "eval_runtime": 2.5109, "eval_samples_per_second": 1.593, "eval_steps_per_second": 0.797, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": 5.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 1.25, "eval_logps/rejected": -338.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.23242188, "eval_nll_loss": 1.296875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 32s", "remaining_time": "37m 44s"} +{"loss": 1.21779785, "grad_norm": 0.5424988, "learning_rate": 9.33e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043339, "rewards/chosen": 7.25, 
"rewards/rejected": 5.09375, "rewards/accuracies": 0.95454544, "rewards/margins": 2.15625, "logps/rejected": -568.0, "logps/chosen": -552.0, "logits/rejected": 0.12792969, "logits/chosen": 0.10058594, "nll_loss": 0.9453125, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "9m 33s", "remaining_time": "36m 18s"} +{"loss": 0.92021484, "grad_norm": 0.26583883, "learning_rate": 8.946e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043908, "rewards/chosen": 8.875, "rewards/rejected": 4.375, "rewards/accuracies": 0.97500002, "rewards/margins": 4.5, "logps/rejected": -600.0, "logps/chosen": -540.0, "logits/rejected": -0.09716797, "logits/chosen": 0.09667969, "nll_loss": 0.89453125, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "11m 19s", "remaining_time": "33m 59s"} +{"loss": 0.86738281, "grad_norm": 0.15066669, "learning_rate": 8.486e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044234, "rewards/chosen": 9.625, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 4.9375, "logps/rejected": -484.0, "logps/chosen": -576.0, "logits/rejected": -0.08984375, "logits/chosen": -0.08984375, "nll_loss": 0.8515625, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "13m 7s", "remaining_time": "31m 52s"} +{"loss": 0.78349609, "grad_norm": 0.07923738, "learning_rate": 7.961e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044579, "rewards/chosen": 10.0, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 5.90625, "logps/rejected": -632.0, "logps/chosen": -564.0, "logits/rejected": 0.07275391, "logits/chosen": 0.3203125, "nll_loss": 0.8203125, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 53s", "remaining_time": "29m 47s"} +{"eval_loss": 0.66748047, "eval_runtime": 2.6003, "eval_samples_per_second": 1.538, "eval_steps_per_second": 
0.769, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.75, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1192.0, "eval_logits/rejected": -1.0859375, "eval_logits/chosen": 0.1484375, "eval_nll_loss": 0.84375, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 56s", "remaining_time": "29m 52s"} +{"loss": 0.80941162, "grad_norm": 0.0941121, "learning_rate": 7.38e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044378, "rewards/chosen": 10.1875, "rewards/rejected": 4.1875, "rewards/accuracies": 1.0, "rewards/margins": 6.0, "logps/rejected": -668.0, "logps/chosen": -428.0, "logits/rejected": 0.1015625, "logits/chosen": -0.05322266, "nll_loss": 0.8359375, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "16m 50s", "remaining_time": "28m 4s"} +{"loss": 0.85614014, "grad_norm": 0.12369237, "learning_rate": 6.753e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044098, "rewards/chosen": 10.4375, "rewards/rejected": 3.609375, "rewards/accuracies": 1.0, "rewards/margins": 6.8125, "logps/rejected": -560.0, "logps/chosen": -446.0, "logits/rejected": 0.10449219, "logits/chosen": 0.09228516, "nll_loss": 0.7578125, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "18m 50s", "remaining_time": "26m 22s"} +{"loss": 0.76994629, "grad_norm": 0.06812108, "learning_rate": 6.093e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04427, "rewards/chosen": 10.9375, "rewards/rejected": 3.921875, "rewards/accuracies": 1.0, "rewards/margins": 7.0, "logps/rejected": -482.0, "logps/chosen": -468.0, "logits/rejected": -0.07568359, "logits/chosen": 0.16894531, "nll_loss": 0.76171875, "epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "20m 38s", "remaining_time": "24m 24s"} +{"loss": 0.79387207, "grad_norm": 0.09556035, 
"learning_rate": 5.413e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044449, "rewards/chosen": 11.375, "rewards/rejected": 3.875, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -494.0, "logps/chosen": -502.0, "logits/rejected": -0.00230408, "logits/chosen": 0.18359375, "nll_loss": 0.7265625, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "22m 26s", "remaining_time": "22m 26s"} +{"eval_loss": 0.65429688, "eval_runtime": 2.5187, "eval_samples_per_second": 1.588, "eval_steps_per_second": 0.794, "eval_rewards/chosen": 11.75, "eval_rewards/rejected": 4.34375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.40625, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0625, "eval_logits/chosen": 0.2265625, "eval_nll_loss": 0.83984375, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "22m 28s", "remaining_time": "22m 28s"} +{"loss": 0.80350952, "grad_norm": 0.07312662, "learning_rate": 4.725e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04445, "rewards/chosen": 11.4375, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 6.84375, "logps/rejected": -664.0, "logps/chosen": -560.0, "logits/rejected": -0.07421875, "logits/chosen": -0.01507568, "nll_loss": 0.76171875, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "24m 18s", "remaining_time": "20m 34s"} +{"loss": 0.69508057, "grad_norm": 0.06881011, "learning_rate": 4.041e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044511, "rewards/chosen": 11.3125, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 7.09375, "logps/rejected": -632.0, "logps/chosen": -360.0, "logits/rejected": 0.31835938, "logits/chosen": 0.07617188, "nll_loss": 0.66796875, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "26m 9s", 
"remaining_time": "18m 40s"} +{"loss": 0.85559082, "grad_norm": 0.07652393, "learning_rate": 3.377e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044231, "rewards/chosen": 11.8125, "rewards/rejected": 4.15625, "rewards/accuracies": 1.0, "rewards/margins": 7.6875, "logps/rejected": -544.0, "logps/chosen": -536.0, "logits/rejected": -0.04516602, "logits/chosen": 0.09716797, "nll_loss": 0.70703125, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "28m 12s", "remaining_time": "16m 55s"} +{"loss": 0.7151062, "grad_norm": 0.05593824, "learning_rate": 2.742e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044286, "rewards/chosen": 11.8125, "rewards/rejected": 4.28125, "rewards/accuracies": 1.0, "rewards/margins": 7.5625, "logps/rejected": -444.0, "logps/chosen": -502.0, "logits/rejected": -0.11035156, "logits/chosen": 0.2109375, "nll_loss": 0.70703125, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "30m 2s", "remaining_time": "15m 1s"} +{"eval_loss": 0.64160156, "eval_runtime": 2.5165, "eval_samples_per_second": 1.59, "eval_steps_per_second": 0.795, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.3125, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.046875, "eval_logits/chosen": 0.27148438, "eval_nll_loss": 0.8203125, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "30m 5s", "remaining_time": "15m 2s"} +{"loss": 0.75612793, "grad_norm": 0.06568256, "learning_rate": 2.151e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044291, "rewards/chosen": 12.125, "rewards/rejected": 4.15625, "rewards/accuracies": 1.0, "rewards/margins": 7.96875, "logps/rejected": -608.0, "logps/chosen": -608.0, "logits/rejected": 0.09423828, "logits/chosen": 0.18652344, "nll_loss": 0.8125, "epoch": 3.52525253, "global_step/max_steps": 
"85/120", "percentage": "70.83%", "elapsed_time": "31m 55s", "remaining_time": "13m 8s"} +{"loss": 0.67840576, "grad_norm": 0.06612494, "learning_rate": 1.614e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043843, "rewards/chosen": 11.625, "rewards/rejected": 3.859375, "rewards/accuracies": 1.0, "rewards/margins": 7.78125, "logps/rejected": -576.0, "logps/chosen": -434.0, "logits/rejected": 0.09033203, "logits/chosen": 0.09375, "nll_loss": 0.65234375, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "34m 9s", "remaining_time": "11m 23s"} +{"loss": 0.74968872, "grad_norm": 0.07071323, "learning_rate": 1.14e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043933, "rewards/chosen": 11.25, "rewards/rejected": 3.59375, "rewards/accuracies": 1.0, "rewards/margins": 7.625, "logps/rejected": -632.0, "logps/chosen": -368.0, "logits/rejected": 0.14355469, "logits/chosen": 0.05249023, "nll_loss": 0.71484375, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "35m 58s", "remaining_time": "9m 28s"} +{"loss": 0.82855835, "grad_norm": 0.06933003, "learning_rate": 7.4e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043751, "rewards/chosen": 11.625, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -608.0, "logps/chosen": -364.0, "logits/rejected": 0.21875, "logits/chosen": -0.00747681, "nll_loss": 0.65625, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "38m 2s", "remaining_time": "7m 36s"} +{"eval_loss": 0.63623047, "eval_runtime": 2.6531, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.28710938, "eval_nll_loss": 
0.8125, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "38m 4s", "remaining_time": "7m 36s"} +{"loss": 0.71901855, "grad_norm": 0.08115599, "learning_rate": 4.21e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043728, "rewards/chosen": 12.1875, "rewards/rejected": 4.4375, "rewards/accuracies": 1.0, "rewards/margins": 7.78125, "logps/rejected": -540.0, "logps/chosen": -422.0, "logits/rejected": 0.05810547, "logits/chosen": 0.0612793, "nll_loss": 0.6796875, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "39m 57s", "remaining_time": "5m 42s"} +{"loss": 0.81259766, "grad_norm": 0.06209746, "learning_rate": 1.89e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043777, "rewards/chosen": 12.5, "rewards/rejected": 4.125, "rewards/accuracies": 1.0, "rewards/margins": 8.375, "logps/rejected": -856.0, "logps/chosen": -712.0, "logits/rejected": 0.13476562, "logits/chosen": 0.13574219, "nll_loss": 0.875, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "41m 49s", "remaining_time": "3m 48s"} +{"loss": 0.81141357, "grad_norm": 0.12783918, "learning_rate": 4.7e-07, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043804, "rewards/chosen": 12.25, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -580.0, "logps/chosen": -584.0, "logits/rejected": -0.11572266, "logits/chosen": 0.16210938, "nll_loss": 0.8515625, "epoch": 4.76767677, "global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "43m 41s", "remaining_time": "1m 53s"} +{"loss": 0.65443115, "grad_norm": 0.08388909, "learning_rate": 0.0, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043853, "rewards/chosen": 12.5625, "rewards/rejected": 3.796875, "rewards/accuracies": 1.0, "rewards/margins": 8.75, "logps/rejected": -704.0, "logps/chosen": -576.0, "logits/rejected": 0.1015625, "logits/chosen": 0.21875, 
"nll_loss": 0.70703125, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "45m 32s", "remaining_time": "0s"} +{"eval_loss": 0.63720703, "eval_runtime": 2.4599, "eval_samples_per_second": 1.626, "eval_steps_per_second": 0.813, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.28515625, "eval_nll_loss": 0.81640625, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "45m 35s", "remaining_time": "0s"} +{"train_runtime": 2736.2309, "train_samples_per_second": 0.724, "train_steps_per_second": 0.044, "total_flos": 11174281019392.0, "train_loss": 1.00743561, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "45m 36s", "remaining_time": "0s"} +{"train_dataset": "1175.542929±552.835821, min=300.000000, max=6095.000000, size=396", "val_dataset": "1179.000000±512.550973, min=698.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1786.3204M Params (9.2324M Trainable [0.5168%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-120", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/checkpoint-100", "best_metric": 0.63623047, "global_step": 120, "log_history": [{"loss": 1.95458984375, "grad_norm": 1.204306976098353, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.028637, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.126953125, 
"logits/chosen": -0.23828125, "nll_loss": 1.59375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 2.4766845703125, "grad_norm": 1.4247437031946648, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.040155, "rewards/chosen": 0.03125, "rewards/rejected": 0.03125, "rewards/accuracies": 0.21875, "rewards/margins": -7.62939453125e-06, "logps/rejected": -708.0, "logps/chosen": -712.0, "logits/rejected": 0.083984375, "logits/chosen": -0.0859375, "nll_loss": 1.515625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.945703125, "grad_norm": 1.2234841967992873, "learning_rate": 9.969653386589748e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.042924, "rewards/chosen": 0.62890625, "rewards/rejected": 0.546875, "rewards/accuracies": 0.550000011920929, "rewards/margins": 0.083984375, "logps/rejected": -592.0, "logps/chosen": -632.0, "logits/rejected": 0.01361083984375, "logits/chosen": 0.1318359375, "nll_loss": 1.109375, "epoch": 0.40404040404040403, "step": 10}, {"loss": 2.1078125, "grad_norm": 0.7354660931188922, "learning_rate": 9.847001329696653e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043724, "rewards/chosen": 2.609375, "rewards/rejected": 2.25, "rewards/accuracies": 0.699999988079071, "rewards/margins": 0.357421875, "logps/rejected": -652.0, "logps/chosen": -652.0, "logits/rejected": -0.021240234375, "logits/chosen": -0.1533203125, "nll_loss": 1.484375, "epoch": 0.6060606060606061, "step": 15}, {"loss": 1.6548828125, "grad_norm": 0.7411526143175573, "learning_rate": 9.632470336074009e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044054, "rewards/chosen": 5.09375, "rewards/rejected": 4.21875, "rewards/accuracies": 0.8500000238418579, "rewards/margins": 0.8828125, "logps/rejected": -580.0, "logps/chosen": -424.0, "logits/rejected": 0.0103759765625, "logits/chosen": -0.07958984375, "nll_loss": 1.2109375, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 1.142578125, "eval_runtime": 2.5109, 
"eval_samples_per_second": 1.593, "eval_steps_per_second": 0.797, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": 5.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 1.25, "eval_logps/rejected": -338.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.232421875, "eval_nll_loss": 1.296875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 1.2177978515625, "grad_norm": 0.542498800867474, "learning_rate": 9.330127018922194e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043339, "rewards/chosen": 7.25, "rewards/rejected": 5.09375, "rewards/accuracies": 0.9545454382896423, "rewards/margins": 2.15625, "logps/rejected": -568.0, "logps/chosen": -552.0, "logits/rejected": 0.1279296875, "logits/chosen": 0.1005859375, "nll_loss": 0.9453125, "epoch": 1.0404040404040404, "step": 25}, {"loss": 0.92021484375, "grad_norm": 0.265838833128634, "learning_rate": 8.945702546981969e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043908, "rewards/chosen": 8.875, "rewards/rejected": 4.375, "rewards/accuracies": 0.9750000238418579, "rewards/margins": 4.5, "logps/rejected": -600.0, "logps/chosen": -540.0, "logits/rejected": -0.09716796875, "logits/chosen": 0.0966796875, "nll_loss": 0.89453125, "epoch": 1.2424242424242424, "step": 30}, {"loss": 0.8673828125, "grad_norm": 0.1506666851461901, "learning_rate": 8.486484005469977e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044234, "rewards/chosen": 9.625, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 4.9375, "logps/rejected": -484.0, "logps/chosen": -576.0, "logits/rejected": -0.08984375, "logits/chosen": -0.08984375, "nll_loss": 0.8515625, "epoch": 1.4444444444444444, "step": 35}, {"loss": 0.78349609375, "grad_norm": 0.07923738211951831, "learning_rate": 7.961176263324901e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044579, "rewards/chosen": 10.0, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 5.90625, 
"logps/rejected": -632.0, "logps/chosen": -564.0, "logits/rejected": 0.07275390625, "logits/chosen": 0.3203125, "nll_loss": 0.8203125, "epoch": 1.6464646464646466, "step": 40}, {"eval_loss": 0.66748046875, "eval_runtime": 2.6003, "eval_samples_per_second": 1.538, "eval_steps_per_second": 0.769, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.75, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1192.0, "eval_logits/rejected": -1.0859375, "eval_logits/chosen": 0.1484375, "eval_nll_loss": 0.84375, "epoch": 1.6464646464646466, "step": 40}, {"loss": 0.80941162109375, "grad_norm": 0.09411209765685535, "learning_rate": 7.379736965185368e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044378, "rewards/chosen": 10.1875, "rewards/rejected": 4.1875, "rewards/accuracies": 1.0, "rewards/margins": 6.0, "logps/rejected": -668.0, "logps/chosen": -428.0, "logits/rejected": 0.1015625, "logits/chosen": -0.05322265625, "nll_loss": 0.8359375, "epoch": 1.8484848484848486, "step": 45}, {"loss": 0.85614013671875, "grad_norm": 0.12369237367680226, "learning_rate": 6.753187775963773e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044098, "rewards/chosen": 10.4375, "rewards/rejected": 3.609375, "rewards/accuracies": 1.0, "rewards/margins": 6.8125, "logps/rejected": -560.0, "logps/chosen": -446.0, "logits/rejected": 0.1044921875, "logits/chosen": 0.09228515625, "nll_loss": 0.7578125, "epoch": 2.080808080808081, "step": 50}, {"loss": 0.7699462890625, "grad_norm": 0.0681210790824193, "learning_rate": 6.09340545603188e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04427, "rewards/chosen": 10.9375, "rewards/rejected": 3.921875, "rewards/accuracies": 1.0, "rewards/margins": 7.0, "logps/rejected": -482.0, "logps/chosen": -468.0, "logits/rejected": -0.07568359375, "logits/chosen": 0.1689453125, "nll_loss": 0.76171875, "epoch": 2.282828282828283, "step": 55}, {"loss": 0.7938720703125, "grad_norm": 
0.09556035300668257, "learning_rate": 5.4128967273616625e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044449, "rewards/chosen": 11.375, "rewards/rejected": 3.875, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -494.0, "logps/chosen": -502.0, "logits/rejected": -0.0023040771484375, "logits/chosen": 0.18359375, "nll_loss": 0.7265625, "epoch": 2.484848484848485, "step": 60}, {"eval_loss": 0.654296875, "eval_runtime": 2.5187, "eval_samples_per_second": 1.588, "eval_steps_per_second": 0.794, "eval_rewards/chosen": 11.75, "eval_rewards/rejected": 4.34375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.40625, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0625, "eval_logits/chosen": 0.2265625, "eval_nll_loss": 0.83984375, "epoch": 2.484848484848485, "step": 60}, {"loss": 0.803509521484375, "grad_norm": 0.0731266213113978, "learning_rate": 4.7245611982206724e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04445, "rewards/chosen": 11.4375, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 6.84375, "logps/rejected": -664.0, "logps/chosen": -560.0, "logits/rejected": -0.07421875, "logits/chosen": -0.01507568359375, "nll_loss": 0.76171875, "epoch": 2.686868686868687, "step": 65}, {"loss": 0.69508056640625, "grad_norm": 0.0688101140360203, "learning_rate": 4.0414468403813095e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044511, "rewards/chosen": 11.3125, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 7.09375, "logps/rejected": -632.0, "logps/chosen": -360.0, "logits/rejected": 0.318359375, "logits/chosen": 0.076171875, "nll_loss": 0.66796875, "epoch": 2.888888888888889, "step": 70}, {"loss": 0.8555908203125, "grad_norm": 0.07652393042135892, "learning_rate": 3.3765026539765834e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044231, "rewards/chosen": 11.8125, "rewards/rejected": 4.15625, "rewards/accuracies": 1.0, 
"rewards/margins": 7.6875, "logps/rejected": -544.0, "logps/chosen": -536.0, "logits/rejected": -0.045166015625, "logits/chosen": 0.09716796875, "nll_loss": 0.70703125, "epoch": 3.121212121212121, "step": 75}, {"loss": 0.715106201171875, "grad_norm": 0.055938241299950876, "learning_rate": 2.7423332084455544e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044286, "rewards/chosen": 11.8125, "rewards/rejected": 4.28125, "rewards/accuracies": 1.0, "rewards/margins": 7.5625, "logps/rejected": -444.0, "logps/chosen": -502.0, "logits/rejected": -0.1103515625, "logits/chosen": 0.2109375, "nll_loss": 0.70703125, "epoch": 3.323232323232323, "step": 80}, {"eval_loss": 0.6416015625, "eval_runtime": 2.5165, "eval_samples_per_second": 1.59, "eval_steps_per_second": 0.795, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.3125, "eval_logps/rejected": -350.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.046875, "eval_logits/chosen": 0.271484375, "eval_nll_loss": 0.8203125, "epoch": 3.323232323232323, "step": 80}, {"loss": 0.7561279296875, "grad_norm": 0.06568256120217658, "learning_rate": 2.150959712448669e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044291, "rewards/chosen": 12.125, "rewards/rejected": 4.15625, "rewards/accuracies": 1.0, "rewards/margins": 7.96875, "logps/rejected": -608.0, "logps/chosen": -608.0, "logits/rejected": 0.09423828125, "logits/chosen": 0.1865234375, "nll_loss": 0.8125, "epoch": 3.525252525252525, "step": 85}, {"loss": 0.67840576171875, "grad_norm": 0.06612494124440302, "learning_rate": 1.6135921418712956e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043843, "rewards/chosen": 11.625, "rewards/rejected": 3.859375, "rewards/accuracies": 1.0, "rewards/margins": 7.78125, "logps/rejected": -576.0, "logps/chosen": -434.0, "logits/rejected": 0.09033203125, "logits/chosen": 0.09375, "nll_loss": 0.65234375, "epoch": 3.7272727272727275, "step": 90}, {"loss": 
0.749688720703125, "grad_norm": 0.07071323350474046, "learning_rate": 1.1404167454183957e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043933, "rewards/chosen": 11.25, "rewards/rejected": 3.59375, "rewards/accuracies": 1.0, "rewards/margins": 7.625, "logps/rejected": -632.0, "logps/chosen": -368.0, "logits/rejected": 0.1435546875, "logits/chosen": 0.052490234375, "nll_loss": 0.71484375, "epoch": 3.929292929292929, "step": 95}, {"loss": 0.828558349609375, "grad_norm": 0.06933003179796686, "learning_rate": 7.404029558083653e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043751, "rewards/chosen": 11.625, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -608.0, "logps/chosen": -364.0, "logits/rejected": 0.21875, "logits/chosen": -0.007476806640625, "nll_loss": 0.65625, "epoch": 4.161616161616162, "step": 100}, {"eval_loss": 0.63623046875, "eval_runtime": 2.6531, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.287109375, "eval_nll_loss": 0.8125, "epoch": 4.161616161616162, "step": 100}, {"loss": 0.7190185546875, "grad_norm": 0.0811559898134451, "learning_rate": 4.2113336672471245e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043728, "rewards/chosen": 12.1875, "rewards/rejected": 4.4375, "rewards/accuracies": 1.0, "rewards/margins": 7.78125, "logps/rejected": -540.0, "logps/chosen": -422.0, "logits/rejected": 0.05810546875, "logits/chosen": 0.061279296875, "nll_loss": 0.6796875, "epoch": 4.363636363636363, "step": 105}, {"loss": 0.81259765625, "grad_norm": 0.062097460024984424, "learning_rate": 1.8865999845374793e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043777, "rewards/chosen": 12.5, "rewards/rejected": 4.125, "rewards/accuracies": 
1.0, "rewards/margins": 8.375, "logps/rejected": -856.0, "logps/chosen": -712.0, "logits/rejected": 0.134765625, "logits/chosen": 0.1357421875, "nll_loss": 0.875, "epoch": 4.565656565656566, "step": 110}, {"loss": 0.81141357421875, "grad_norm": 0.12783918074585626, "learning_rate": 4.738957681248379e-07, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043804, "rewards/chosen": 12.25, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -580.0, "logps/chosen": -584.0, "logits/rejected": -0.11572265625, "logits/chosen": 0.162109375, "nll_loss": 0.8515625, "epoch": 4.767676767676767, "step": 115}, {"loss": 0.65443115234375, "grad_norm": 0.0838890904333176, "learning_rate": 0.0, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.043853, "rewards/chosen": 12.5625, "rewards/rejected": 3.796875, "rewards/accuracies": 1.0, "rewards/margins": 8.75, "logps/rejected": -704.0, "logps/chosen": -576.0, "logits/rejected": 0.1015625, "logits/chosen": 0.21875, "nll_loss": 0.70703125, "epoch": 4.96969696969697, "step": 120}, {"eval_loss": 0.63720703125, "eval_runtime": 2.4599, "eval_samples_per_second": 1.626, "eval_steps_per_second": 0.813, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.28515625, "eval_nll_loss": 0.81640625, "epoch": 4.96969696969697, "step": 120}, {"train_runtime": 2736.2309, "train_samples_per_second": 0.724, "train_steps_per_second": 0.044, "total_flos": 11174281019392.0, "train_loss": 1.0074356079101563, "epoch": 4.96969696969697, "step": 120}], "memory": 25.8828125} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs/events.out.tfevents.1737729115.kml-dtmachine-18088-prod.79226.0 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs/events.out.tfevents.1737729115.kml-dtmachine-18088-prod.79226.0 new file mode 100644 index 0000000000000000000000000000000000000000..7209475fe72cb837e430d68562ada22ec6086567 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-143034/runs/events.out.tfevents.1737729115.kml-dtmachine-18088-prod.79226.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73037e96c07c5f41afccadfcb01110a1e0a6108173b85a9aad338a1404c7940c +size 33762 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/args.json new file mode 100644 index 0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + 
null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": 
true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": 
"auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": 
false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + 
"optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), 
ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, 
use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, 
optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, 
metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_config.json 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6602e2cd5f47e7714ea4df9914de4c94559e4fdf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515113945d14e03b0a67104df4235d9d2bda2d0a788d4d24ddc1d055937d15a5 +size 18516456 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": 
"reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, 
save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 
'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, 
sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7b6db292bc8e7595638749ec33921e1bfb70fac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd29694eab740cc6ba10aad51a9122d17b64218e4eefa5f4ea2a4978313f9a5e +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66a392be95faf32ff04fb3e2bb10eead6773176e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f0d5ccfb9a12c2b043ca3eeb7b2727656b76e56820b0049089de0bee985b93 +size 55398320 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4630ab910148ec496a9371c15027fce0c8148456 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3f4e8e1a1b6f61ec61b812ff2ba61b0a7e5785cd590c83b6c86e91800cac88 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85178f05a86ad889883386505dce2b07272a874 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b88a9dee27b13485971f65c54fd1abf7f03ebc80932a59b02e1688ef0cce8f +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/scheduler.pt @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4c709074970e061996a851c3f5b2babe1841b17b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.63623047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + 
"step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": -652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + 
"eval_steps_per_second": 0.845, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.5341713832460793, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.09521484375, + "logits/rejected": 0.123046875, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 1.214501953125, + "memory(GiB)": 25.88, + "nll_loss": 0.94921875, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.21875, + "rewards/margins": 2.109375, + "rewards/rejected": 5.125, + "step": 25, + "train_speed(iter/s)": 0.045353 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.24559828292137062, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.09423828125, + "logits/rejected": -0.1015625, + "logps/chosen": -540.0, + "logps/rejected": -600.0, + "loss": 0.91485595703125, + "memory(GiB)": 25.88, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5625, + "rewards/rejected": 4.3125, + "step": 30, + "train_speed(iter/s)": 0.045808 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1553007238676429, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.09130859375, + "logits/rejected": -0.08935546875, + "logps/chosen": -572.0, + "logps/rejected": -484.0, + "loss": 0.866796875, + "memory(GiB)": 25.88, + "nll_loss": 0.84765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 4.96875, + "rewards/rejected": 4.75, + "step": 35, + "train_speed(iter/s)": 0.046016 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0791928852302197, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.32421875, + "logits/rejected": 0.076171875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.78258056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.81640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.78125, + "rewards/rejected": 4.25, + "step": 40, + "train_speed(iter/s)": 0.046217 + }, + { + 
"epoch": 1.6464646464646466, + "eval_logits/chosen": 0.15234375, + "eval_logits/rejected": -1.078125, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.65625, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.6875, + "eval_samples_per_second": 1.488, + "eval_steps_per_second": 0.744, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.10127792485834027, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.0478515625, + "logits/rejected": 0.10693359375, + "logps/chosen": -426.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 25.88, + "nll_loss": 0.8359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 5.8125, + "rewards/rejected": 4.34375, + "step": 45, + "train_speed(iter/s)": 0.046248 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12223882923443313, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09765625, + "logits/rejected": 0.111328125, + "logps/chosen": -444.0, + "logps/rejected": -556.0, + "loss": 0.85374755859375, + "memory(GiB)": 25.88, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 6.8125, + "rewards/rejected": 3.71875, + "step": 50, + "train_speed(iter/s)": 0.045842 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.06960454571278732, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1708984375, + "logits/rejected": -0.06884765625, + "logps/chosen": -468.0, + "logps/rejected": -480.0, + "loss": 0.769140625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 6.96875, + "rewards/rejected": 4.03125, + "step": 55, + "train_speed(iter/s)": 0.045954 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.11142613870327374, + 
"learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.185546875, + "logits/rejected": 0.00433349609375, + "logps/chosen": -502.0, + "logps/rejected": -492.0, + "loss": 0.79381103515625, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.375, + "rewards/rejected": 4.0, + "step": 60, + "train_speed(iter/s)": 0.046182 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0703125, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64990234375, + "eval_nll_loss": 0.83203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.25, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.3346, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.857, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06831207602075139, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01300048828125, + "logits/rejected": -0.07470703125, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.801824951171875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 7.0, + "rewards/rejected": 4.46875, + "step": 65, + "train_speed(iter/s)": 0.046252 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0695294288435998, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.080078125, + "logits/rejected": 0.322265625, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.6947265625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.125, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.046346 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07630483079111013, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09912109375, + 
"logits/rejected": -0.04052734375, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.855328369140625, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 7.71875, + "rewards/rejected": 4.125, + "step": 75, + "train_speed(iter/s)": 0.046132 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.056810731843726946, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.21484375, + "logits/rejected": -0.10302734375, + "logps/chosen": -504.0, + "logps/rejected": -446.0, + "loss": 0.715185546875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 7.59375, + "rewards/rejected": 4.1875, + "step": 80, + "train_speed(iter/s)": 0.046264 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.275390625, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64111328125, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": 4.5625, + "eval_runtime": 2.5228, + "eval_samples_per_second": 1.586, + "eval_steps_per_second": 0.793, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.0661057850452002, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": 0.1923828125, + "logits/rejected": 0.0966796875, + "logps/chosen": -608.0, + "logps/rejected": -608.0, + "loss": 0.755145263671875, + "memory(GiB)": 25.88, + "nll_loss": 0.80859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 8.1875, + "rewards/rejected": 4.09375, + "step": 85, + "train_speed(iter/s)": 0.046287 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.06714498393054905, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": 0.10107421875, + "logits/rejected": 0.09423828125, + "logps/chosen": -434.0, + 
"logps/rejected": -576.0, + "loss": 0.677581787109375, + "memory(GiB)": 25.88, + "nll_loss": 0.6484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.6875, + "rewards/margins": 7.90625, + "rewards/rejected": 3.8125, + "step": 90, + "train_speed(iter/s)": 0.046375 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.07216333845405633, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": 0.059814453125, + "logits/rejected": 0.1484375, + "logps/chosen": -368.0, + "logps/rejected": -632.0, + "loss": 0.747625732421875, + "memory(GiB)": 25.88, + "nll_loss": 0.7109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 7.75, + "rewards/rejected": 3.515625, + "step": 95, + "train_speed(iter/s)": 0.046466 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.0693964252521182, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": 9.000301361083984e-06, + "logits/rejected": 0.2236328125, + "logps/chosen": -364.0, + "logps/rejected": -608.0, + "loss": 0.82763671875, + "memory(GiB)": 25.88, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 7.53125, + "rewards/rejected": 4.0625, + "step": 100, + "train_speed(iter/s)": 0.04631 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": 0.287109375, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.63623046875, + "eval_nll_loss": 0.8125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.125, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 2.7031, + "eval_samples_per_second": 1.48, + "eval_steps_per_second": 0.74, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": 
true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9336058478592.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92c0077c396ee664c266fad41fc6490f0d2e562c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f983afc7dc47b420073feb86b01d62f0afc2819cc8b1bbcdae238ccaa8711bc3 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccfeedd0dfcb0b6b654b77e4fabbd3d08b772352 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7737f1c0569d6c72bf6b1db2193140d38f55d2dffc72d47280efb46d9f2e3d6b +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a65833a17679ca9df0064bacd5189f4835f669d5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c8111c45fbe966bf193c639f2ccb4e7285e9a2046e14be471a9294bd9f7e546 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd73fc484029ac8e61a42ce1da4c0c01c8a6e8e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f50fc238799cb875ba398f5645372c169d3d5a781b4b52e2b6d28c15ca91043 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea35f6170ea4973cf2b61b37628d99dd422f06f5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b692f2e44a761349843c093c6e0f9a39386a746f0716c83b76c53131568bd73 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..1ff406405418d84068458850f74aecfc6224f793 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/latest @@ -0,0 +1 @@ +global_step122 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a49f44ba05d98a84fd55c18c4fa41c6437c8853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..79ef7e8924723bd699efa313eb78103d80b7edb9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40007a79aad967206b797079ca5147beff46ee1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede1043a0735266b510faa06f578fa6ef180c11e994a142a88a13ac6f33eb78b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b11af340c34a64bc65323ea9f86f555ba7c89c68 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.63476562, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120", + "epoch": 4.96969696969697, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + "step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": 
-652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + "eval_steps_per_second": 0.845, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.5341713832460793, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.09521484375, + "logits/rejected": 0.123046875, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 1.214501953125, + "memory(GiB)": 25.88, + "nll_loss": 0.94921875, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.21875, + "rewards/margins": 2.109375, + "rewards/rejected": 5.125, + "step": 25, + "train_speed(iter/s)": 0.045353 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.24559828292137062, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.09423828125, + "logits/rejected": -0.1015625, + "logps/chosen": -540.0, + 
"logps/rejected": -600.0, + "loss": 0.91485595703125, + "memory(GiB)": 25.88, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5625, + "rewards/rejected": 4.3125, + "step": 30, + "train_speed(iter/s)": 0.045808 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1553007238676429, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.09130859375, + "logits/rejected": -0.08935546875, + "logps/chosen": -572.0, + "logps/rejected": -484.0, + "loss": 0.866796875, + "memory(GiB)": 25.88, + "nll_loss": 0.84765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 4.96875, + "rewards/rejected": 4.75, + "step": 35, + "train_speed(iter/s)": 0.046016 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0791928852302197, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.32421875, + "logits/rejected": 0.076171875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.78258056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.81640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.78125, + "rewards/rejected": 4.25, + "step": 40, + "train_speed(iter/s)": 0.046217 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.15234375, + "eval_logits/rejected": -1.078125, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.65625, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.6875, + "eval_samples_per_second": 1.488, + "eval_steps_per_second": 0.744, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.10127792485834027, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.0478515625, + "logits/rejected": 0.10693359375, + "logps/chosen": -426.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 
25.88, + "nll_loss": 0.8359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 5.8125, + "rewards/rejected": 4.34375, + "step": 45, + "train_speed(iter/s)": 0.046248 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12223882923443313, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09765625, + "logits/rejected": 0.111328125, + "logps/chosen": -444.0, + "logps/rejected": -556.0, + "loss": 0.85374755859375, + "memory(GiB)": 25.88, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 6.8125, + "rewards/rejected": 3.71875, + "step": 50, + "train_speed(iter/s)": 0.045842 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.06960454571278732, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1708984375, + "logits/rejected": -0.06884765625, + "logps/chosen": -468.0, + "logps/rejected": -480.0, + "loss": 0.769140625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 6.96875, + "rewards/rejected": 4.03125, + "step": 55, + "train_speed(iter/s)": 0.045954 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.11142613870327374, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.185546875, + "logits/rejected": 0.00433349609375, + "logps/chosen": -502.0, + "logps/rejected": -492.0, + "loss": 0.79381103515625, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.375, + "rewards/rejected": 4.0, + "step": 60, + "train_speed(iter/s)": 0.046182 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0703125, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64990234375, + "eval_nll_loss": 0.83203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.25, + "eval_rewards/rejected": 4.5, 
+ "eval_runtime": 2.3346, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.857, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06831207602075139, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01300048828125, + "logits/rejected": -0.07470703125, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.801824951171875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 7.0, + "rewards/rejected": 4.46875, + "step": 65, + "train_speed(iter/s)": 0.046252 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0695294288435998, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.080078125, + "logits/rejected": 0.322265625, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.6947265625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.125, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.046346 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07630483079111013, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09912109375, + "logits/rejected": -0.04052734375, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.855328369140625, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 7.71875, + "rewards/rejected": 4.125, + "step": 75, + "train_speed(iter/s)": 0.046132 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.056810731843726946, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.21484375, + "logits/rejected": -0.10302734375, + "logps/chosen": -504.0, + "logps/rejected": -446.0, + "loss": 0.715185546875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 7.59375, + "rewards/rejected": 4.1875, + 
"step": 80, + "train_speed(iter/s)": 0.046264 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.275390625, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64111328125, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": 4.5625, + "eval_runtime": 2.5228, + "eval_samples_per_second": 1.586, + "eval_steps_per_second": 0.793, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.0661057850452002, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": 0.1923828125, + "logits/rejected": 0.0966796875, + "logps/chosen": -608.0, + "logps/rejected": -608.0, + "loss": 0.755145263671875, + "memory(GiB)": 25.88, + "nll_loss": 0.80859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 8.1875, + "rewards/rejected": 4.09375, + "step": 85, + "train_speed(iter/s)": 0.046287 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.06714498393054905, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": 0.10107421875, + "logits/rejected": 0.09423828125, + "logps/chosen": -434.0, + "logps/rejected": -576.0, + "loss": 0.677581787109375, + "memory(GiB)": 25.88, + "nll_loss": 0.6484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.6875, + "rewards/margins": 7.90625, + "rewards/rejected": 3.8125, + "step": 90, + "train_speed(iter/s)": 0.046375 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.07216333845405633, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": 0.059814453125, + "logits/rejected": 0.1484375, + "logps/chosen": -368.0, + "logps/rejected": -632.0, + "loss": 0.747625732421875, + "memory(GiB)": 25.88, + "nll_loss": 0.7109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.3125, + "rewards/margins": 7.75, + "rewards/rejected": 3.515625, + "step": 95, + "train_speed(iter/s)": 0.046466 + }, + { + "epoch": 
4.161616161616162, + "grad_norm": 0.0693964252521182, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": 9.000301361083984e-06, + "logits/rejected": 0.2236328125, + "logps/chosen": -364.0, + "logps/rejected": -608.0, + "loss": 0.82763671875, + "memory(GiB)": 25.88, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 7.53125, + "rewards/rejected": 4.0625, + "step": 100, + "train_speed(iter/s)": 0.04631 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": 0.287109375, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.63623046875, + "eval_nll_loss": 0.8125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.125, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 2.7031, + "eval_samples_per_second": 1.48, + "eval_steps_per_second": 0.74, + "step": 100 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.08188869975605054, + "learning_rate": 4.2113336672471245e-06, + "logits/chosen": 0.06640625, + "logits/rejected": 0.061279296875, + "logps/chosen": -422.0, + "logps/rejected": -540.0, + "loss": 0.71771240234375, + "memory(GiB)": 25.88, + "nll_loss": 0.67578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 7.75, + "rewards/rejected": 4.46875, + "step": 105, + "train_speed(iter/s)": 0.0463 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 0.06339462001786819, + "learning_rate": 1.8865999845374793e-06, + "logits/chosen": 0.140625, + "logits/rejected": 0.138671875, + "logps/chosen": -712.0, + "logps/rejected": -860.0, + "loss": 0.81170654296875, + "memory(GiB)": 25.88, + "nll_loss": 0.875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5625, + "rewards/margins": 8.5, + "rewards/rejected": 4.0625, + "step": 110, + "train_speed(iter/s)": 0.046346 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.12700913709178566, + "learning_rate": 
4.738957681248379e-07, + "logits/chosen": 0.1669921875, + "logits/rejected": -0.10791015625, + "logps/chosen": -588.0, + "logps/rejected": -580.0, + "loss": 0.81134033203125, + "memory(GiB)": 25.88, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.125, + "rewards/margins": 8.1875, + "rewards/rejected": 3.96875, + "step": 115, + "train_speed(iter/s)": 0.04641 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.07997734087252309, + "learning_rate": 0.0, + "logits/chosen": 0.2255859375, + "logits/rejected": 0.10791015625, + "logps/chosen": -572.0, + "logps/rejected": -704.0, + "loss": 0.65350341796875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.6875, + "rewards/margins": 8.9375, + "rewards/rejected": 3.71875, + "step": 120, + "train_speed(iter/s)": 0.046432 + }, + { + "epoch": 4.96969696969697, + "eval_logits/chosen": 0.29296875, + "eval_logits/rejected": -1.03125, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.634765625, + "eval_nll_loss": 0.81640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.1875, + "eval_rewards/rejected": 4.375, + "eval_runtime": 2.2615, + "eval_samples_per_second": 1.769, + "eval_steps_per_second": 0.884, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 11174281019392.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dd58a40c37a62478372a975f7a0ff432e1372796 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aac04fcb1cec660eacfa4224f2745d556fb9c5011bfbcabe154a8883f93eae0 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbf1bbebb181cc4dfbccb39788c8747f0151c580 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfadb361ee71a0515f1ce4f3f85016db312aae8294adf1bc3448f910771cd4f6 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbb9f7865c3eac8d4cded3f0952bfbd322a8d29a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7229ba5bb34103978f73d3f45df30c5cd81b4ae7a481350470e4cf4e6b17c734 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f646310720c3b9f29d5af4447359561c1c90521 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef80b66cb9665fda10e1c287bf309068c067530e8e93159875f043b452f62af6 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba8e1fd68649ba2ea2f1c48fda23861a1de441a7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2921198144698255700f0b59dce87792be511022eedd048e89f00c98ab60799d +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f3772e40de01a3caacd97879ed035070bae33293 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 1.13671875, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + "step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": 
-652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + "eval_steps_per_second": 0.845, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1917401104384.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/training_args.bin 
new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb8d15adb69b4cca0db095c1517df4bc6cb7db4e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c260fa4bc5e9040b51766a33cb59c3f223d6652fc4d0ed6df277ecf9d25bc +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3897ae29c2a180e564fb0d1798d8cb7da5436a0e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4180efc63cc95795d45859f1bf77dc64dbf0e965814f6d1e6ea4f58fa476cd8f +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..942f9aab810f444d88dbf7484dc153c712ef159d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9cd88c4d4b9567abe75147a5bbb18650e978f56422bd61f74b0afc102ae576 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0551ccd65404bbe04e0ac3f0dd0c1ca9971210f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881b8e9eae6a85ffb4aa0cab8ea5abb663a77e610afa8a1d75230186dd867ea5 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e83b853fd0419c832eaa083b512f8dbad45738f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aba0361d28a4cab0466de43f19116b16733f73caf7f61f569f06160bf9c3edb +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a92360f81dc0d1c269fedb287208651cc9138bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.66748047, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + "step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": 
-652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + "eval_steps_per_second": 0.845, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.5341713832460793, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.09521484375, + "logits/rejected": 0.123046875, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 1.214501953125, + "memory(GiB)": 25.88, + "nll_loss": 0.94921875, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.21875, + "rewards/margins": 2.109375, + "rewards/rejected": 5.125, + "step": 25, + "train_speed(iter/s)": 0.045353 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.24559828292137062, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.09423828125, + "logits/rejected": -0.1015625, + "logps/chosen": -540.0, + 
"logps/rejected": -600.0, + "loss": 0.91485595703125, + "memory(GiB)": 25.88, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5625, + "rewards/rejected": 4.3125, + "step": 30, + "train_speed(iter/s)": 0.045808 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1553007238676429, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.09130859375, + "logits/rejected": -0.08935546875, + "logps/chosen": -572.0, + "logps/rejected": -484.0, + "loss": 0.866796875, + "memory(GiB)": 25.88, + "nll_loss": 0.84765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 4.96875, + "rewards/rejected": 4.75, + "step": 35, + "train_speed(iter/s)": 0.046016 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0791928852302197, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.32421875, + "logits/rejected": 0.076171875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.78258056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.81640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.78125, + "rewards/rejected": 4.25, + "step": 40, + "train_speed(iter/s)": 0.046217 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.15234375, + "eval_logits/rejected": -1.078125, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.65625, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.6875, + "eval_samples_per_second": 1.488, + "eval_steps_per_second": 0.744, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": 
true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3927512711168.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..163699b59112a55a83a2c5eceb0641cfb5ef202e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21b5fbb3e9d8c5979d01dad73f6911f7ad3ef4f5d7aa07694acf90a3186af1d +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e3a66ff61fb167914354e5180f4d6e310acdbea --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b263ddc55ba3a46f6a963e3f92e7f937f53232d4360f2fad1bd87c3f149389 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6b463193208556d9110fce213823e9a5fc4915c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc38cf0d70cbd181ec1d6f5b9d6738f9bbc0c7dad028140bc263887ce44375ee +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d66364434508a5c3748c75b1164d91f99972dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f573f7e68e47d235331723d53e0e2b94852e73ab87d18b19bf12a9a5ef66d00 +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e87180594ae30c1652e34c6356ee94e7d56cc37b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353a8c6ad06e6545638a84b0ee3ff96bee9d1d72dbfec9baf13f431cec8c8b14 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d23e4c8eb7cf607b4ef75c39fc702df549051cd0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.64990234, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + "step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": 
-652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + "eval_steps_per_second": 0.845, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.5341713832460793, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.09521484375, + "logits/rejected": 0.123046875, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 1.214501953125, + "memory(GiB)": 25.88, + "nll_loss": 0.94921875, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.21875, + "rewards/margins": 2.109375, + "rewards/rejected": 5.125, + "step": 25, + "train_speed(iter/s)": 0.045353 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.24559828292137062, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.09423828125, + "logits/rejected": -0.1015625, + "logps/chosen": -540.0, + 
"logps/rejected": -600.0, + "loss": 0.91485595703125, + "memory(GiB)": 25.88, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5625, + "rewards/rejected": 4.3125, + "step": 30, + "train_speed(iter/s)": 0.045808 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1553007238676429, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.09130859375, + "logits/rejected": -0.08935546875, + "logps/chosen": -572.0, + "logps/rejected": -484.0, + "loss": 0.866796875, + "memory(GiB)": 25.88, + "nll_loss": 0.84765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 4.96875, + "rewards/rejected": 4.75, + "step": 35, + "train_speed(iter/s)": 0.046016 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0791928852302197, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.32421875, + "logits/rejected": 0.076171875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.78258056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.81640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.78125, + "rewards/rejected": 4.25, + "step": 40, + "train_speed(iter/s)": 0.046217 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.15234375, + "eval_logits/rejected": -1.078125, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.65625, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.6875, + "eval_samples_per_second": 1.488, + "eval_steps_per_second": 0.744, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.10127792485834027, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.0478515625, + "logits/rejected": 0.10693359375, + "logps/chosen": -426.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 
25.88, + "nll_loss": 0.8359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 5.8125, + "rewards/rejected": 4.34375, + "step": 45, + "train_speed(iter/s)": 0.046248 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12223882923443313, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09765625, + "logits/rejected": 0.111328125, + "logps/chosen": -444.0, + "logps/rejected": -556.0, + "loss": 0.85374755859375, + "memory(GiB)": 25.88, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 6.8125, + "rewards/rejected": 3.71875, + "step": 50, + "train_speed(iter/s)": 0.045842 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.06960454571278732, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1708984375, + "logits/rejected": -0.06884765625, + "logps/chosen": -468.0, + "logps/rejected": -480.0, + "loss": 0.769140625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 6.96875, + "rewards/rejected": 4.03125, + "step": 55, + "train_speed(iter/s)": 0.045954 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.11142613870327374, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.185546875, + "logits/rejected": 0.00433349609375, + "logps/chosen": -502.0, + "logps/rejected": -492.0, + "loss": 0.79381103515625, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.375, + "rewards/rejected": 4.0, + "step": 60, + "train_speed(iter/s)": 0.046182 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0703125, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64990234375, + "eval_nll_loss": 0.83203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.25, + "eval_rewards/rejected": 4.5, 
+ "eval_runtime": 2.3346, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.857, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5616157884416.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9ee9c1fc33fbc6ebbb3e4aefa6a292f23aa6ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7896cd353e2cb4cd62a6e3b523f921236b8bdac1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1925a8be79872d27954197d2358c2082447101e5bd6f30a51504fba79dc9aac4 +size 18516456 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..343025c7f8f519ef65be9bfd7aa562e11c720d99 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11c9fe7c0b8ad5d9ac379bcfab66f15f9e8a0f95 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e543eb99b202cc9e534817bbb6600f8164815eeefbef783e240eecefdb62ac1 +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a105d277bcacd95a1fdce2fa37baebf5186d559 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58cd4cbd2dfa5cd91be92c9044776623c7d0c93a187025074187345a176f69ee +size 55398320 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b11e2383d1fe0d9ae08f25ec5344cdf1324e262 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e1af91751f6f712831b27ccd88e9190db7cd0cb6957244cc0457a9f7f334be +size 388374 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd5496587aabc679a0a739cef4cba4d2153581f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e0b21b942b6d1b79d1104c54d47ae2678c93499ec417d02cad0283144e3ea0 +size 388374 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46d28f2ba8ca4c7f825afcf1f40213aa963422e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.64111328, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 1.2045486456035128, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.126953125, + "logps/chosen": -552.0, + "logps/rejected": -1064.0, + "loss": 1.95458984375, + "memory(GiB)": 25.88, + "nll_loss": 1.59375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.027709 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.3416035124505692, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.0859375, + "logits/rejected": 0.08447265625, + "logps/chosen": -712.0, + "logps/rejected": -708.0, + "loss": 2.4532470703125, + "memory(GiB)": 25.88, + "nll_loss": 1.515625, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.006256103515625, + "step": 5, + "train_speed(iter/s)": 0.041711 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2187308832330528, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": 0.1298828125, + "logits/rejected": 0.01153564453125, + "logps/chosen": -632.0, + "logps/rejected": -592.0, + "loss": 1.98896484375, + "memory(GiB)": 25.88, + "nll_loss": 1.109375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.57421875, + "step": 10, + "train_speed(iter/s)": 0.044665 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7376667652661941, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": -0.02685546875, + "logps/chosen": 
-652.0, + "logps/rejected": -652.0, + "loss": 2.097216796875, + "memory(GiB)": 25.88, + "nll_loss": 1.484375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 2.609375, + "rewards/margins": 0.384765625, + "rewards/rejected": 2.234375, + "step": 15, + "train_speed(iter/s)": 0.04573 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.7414357248505932, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -0.08837890625, + "logits/rejected": 0.004486083984375, + "logps/chosen": -424.0, + "logps/rejected": -580.0, + "loss": 1.660498046875, + "memory(GiB)": 25.88, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 5.125, + "rewards/margins": 0.86328125, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.046282 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": 0.220703125, + "eval_logits/rejected": -1.046875, + "eval_logps/chosen": -1224.0, + "eval_logps/rejected": -338.0, + "eval_loss": 1.13671875, + "eval_nll_loss": 1.296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 1.203125, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 2.3676, + "eval_samples_per_second": 1.689, + "eval_steps_per_second": 0.845, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.5341713832460793, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": 0.09521484375, + "logits/rejected": 0.123046875, + "logps/chosen": -556.0, + "logps/rejected": -568.0, + "loss": 1.214501953125, + "memory(GiB)": 25.88, + "nll_loss": 0.94921875, + "rewards/accuracies": 0.9545454382896423, + "rewards/chosen": 7.21875, + "rewards/margins": 2.109375, + "rewards/rejected": 5.125, + "step": 25, + "train_speed(iter/s)": 0.045353 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.24559828292137062, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": 0.09423828125, + "logits/rejected": -0.1015625, + "logps/chosen": -540.0, + 
"logps/rejected": -600.0, + "loss": 0.91485595703125, + "memory(GiB)": 25.88, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.875, + "rewards/margins": 4.5625, + "rewards/rejected": 4.3125, + "step": 30, + "train_speed(iter/s)": 0.045808 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.1553007238676429, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -0.09130859375, + "logits/rejected": -0.08935546875, + "logps/chosen": -572.0, + "logps/rejected": -484.0, + "loss": 0.866796875, + "memory(GiB)": 25.88, + "nll_loss": 0.84765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.75, + "rewards/margins": 4.96875, + "rewards/rejected": 4.75, + "step": 35, + "train_speed(iter/s)": 0.046016 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.0791928852302197, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": 0.32421875, + "logits/rejected": 0.076171875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.78258056640625, + "memory(GiB)": 25.88, + "nll_loss": 0.81640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.78125, + "rewards/rejected": 4.25, + "step": 40, + "train_speed(iter/s)": 0.046217 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": 0.15234375, + "eval_logits/rejected": -1.078125, + "eval_logps/chosen": -1192.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.66748046875, + "eval_nll_loss": 0.84375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 5.65625, + "eval_rewards/rejected": 4.5, + "eval_runtime": 2.6875, + "eval_samples_per_second": 1.488, + "eval_steps_per_second": 0.744, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.10127792485834027, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -0.0478515625, + "logits/rejected": 0.10693359375, + "logps/chosen": -426.0, + "logps/rejected": -668.0, + "loss": 0.80941162109375, + "memory(GiB)": 
25.88, + "nll_loss": 0.8359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.1875, + "rewards/margins": 5.8125, + "rewards/rejected": 4.34375, + "step": 45, + "train_speed(iter/s)": 0.046248 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.12223882923443313, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": 0.09765625, + "logits/rejected": 0.111328125, + "logps/chosen": -444.0, + "logps/rejected": -556.0, + "loss": 0.85374755859375, + "memory(GiB)": 25.88, + "nll_loss": 0.75390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 6.8125, + "rewards/rejected": 3.71875, + "step": 50, + "train_speed(iter/s)": 0.045842 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.06960454571278732, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": 0.1708984375, + "logits/rejected": -0.06884765625, + "logps/chosen": -468.0, + "logps/rejected": -480.0, + "loss": 0.769140625, + "memory(GiB)": 25.88, + "nll_loss": 0.76171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 6.96875, + "rewards/rejected": 4.03125, + "step": 55, + "train_speed(iter/s)": 0.045954 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.11142613870327374, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": 0.185546875, + "logits/rejected": 0.00433349609375, + "logps/chosen": -502.0, + "logps/rejected": -492.0, + "loss": 0.79381103515625, + "memory(GiB)": 25.88, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.375, + "rewards/rejected": 4.0, + "step": 60, + "train_speed(iter/s)": 0.046182 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": 0.2265625, + "eval_logits/rejected": -1.0703125, + "eval_logps/chosen": -1176.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64990234375, + "eval_nll_loss": 0.83203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.75, + "eval_rewards/margins": 7.25, + "eval_rewards/rejected": 4.5, 
+ "eval_runtime": 2.3346, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.857, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06831207602075139, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -0.01300048828125, + "logits/rejected": -0.07470703125, + "logps/chosen": -560.0, + "logps/rejected": -664.0, + "loss": 0.801824951171875, + "memory(GiB)": 25.88, + "nll_loss": 0.7578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.4375, + "rewards/margins": 7.0, + "rewards/rejected": 4.46875, + "step": 65, + "train_speed(iter/s)": 0.046252 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0695294288435998, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": 0.080078125, + "logits/rejected": 0.322265625, + "logps/chosen": -360.0, + "logps/rejected": -632.0, + "loss": 0.6947265625, + "memory(GiB)": 25.88, + "nll_loss": 0.66796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.375, + "rewards/margins": 7.125, + "rewards/rejected": 4.21875, + "step": 70, + "train_speed(iter/s)": 0.046346 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.07630483079111013, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": 0.09912109375, + "logits/rejected": -0.04052734375, + "logps/chosen": -536.0, + "logps/rejected": -544.0, + "loss": 0.855328369140625, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 7.71875, + "rewards/rejected": 4.125, + "step": 75, + "train_speed(iter/s)": 0.046132 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.056810731843726946, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": 0.21484375, + "logits/rejected": -0.10302734375, + "logps/chosen": -504.0, + "logps/rejected": -446.0, + "loss": 0.715185546875, + "memory(GiB)": 25.88, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 7.59375, + "rewards/rejected": 4.1875, + 
"step": 80, + "train_speed(iter/s)": 0.046264 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": 0.275390625, + "eval_logits/rejected": -1.0390625, + "eval_logps/chosen": -1168.0, + "eval_logps/rejected": -348.0, + "eval_loss": 0.64111328125, + "eval_nll_loss": 0.8203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.625, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": 4.5625, + "eval_runtime": 2.5228, + "eval_samples_per_second": 1.586, + "eval_steps_per_second": 0.793, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7429450858496.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..908203c3defb97cf621adc1c3875027503b210ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ef77af08fd3e710cf03f0fcffacee5bfc67e4f9d29ba1bc6650464d0f6f138 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..fa8bcbf513c51b620be5546f329ee950ed2979b5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1862d537b115f83f2e8a2727e33dda948d0e8ca7 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2f765d542f8f3758f47847c150eebc2c294d33a7 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..db25ac2ffb77931ebc542ed94858c24ae33895ff Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..723f5321425fcca2c1a1079f2bc0ba3cdb38541a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..ce3cd96d7b0267d11358780f1fa2b819b6b57fa1 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..8cead5875b75d617a13f1247e15b17c39a5168b6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..ccaee2310d50a5ce8d2d3af8bb1eab69e27fe168 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..d38a234997bb344f4b0644cde1f46061b6ed41f2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..55e064619087b21f5dccd4c274374e3a3b92d132 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..aa6e94714d44af6ab0cff75f0c0ecfc42a501e9a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..e61aac476691566e02b572fd62f0b7d22a954f23 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3d494e8bb0a7051c3db9d04473ff154dd965fb51 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..2c72126dce5328d7e5cbf3f3ba730b04185f2ff2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..5a769a14f8068691226ab6f3edfbf417a3ffa872 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..a138c3207804ef5be69dc8ff0d41e1abf82a564e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..a1fd3556517d619e801b5860d2591b65a52e8850 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..02ecee247ece0d512a26e2eae4d274a37b14f443 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..a0911fac053bb74cdd00305f90813f73237a54a3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..d7b5f967f27536f175aabcc1eae6c49f8f739a5a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..be490b4aa1dfb480d07bf87a3f5f8a77500154ee Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..a03d47fc57cb60dec7ec863cb703445c8585106a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..61d314f2669c413bbc5962ff4ca52f355fee0583 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..11e2153aed05617f30d32fce5b4c854ecb8fdc3d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e21fd197adbbcfba509fb1cc980f74f2149431f9 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..95233aca0c743004ee84d23e859d0e5ee0193dc1 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..8734de0fb21b509d967c4834835ced9de3a273b0 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..ec872f7a167edca016ee95a1b6cee26d2b133194 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6747fc814268b672e6b6c792a456e72b331ba214 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_loss.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..d2a2eaff7200fffd4df18ad4e0bab1147facd9b4 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..d2d63f2c2106cff03bf74728d828eb1e3c75eaeb Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..6c12b9b1b80267b326e53bc402f3b53eb22e62c5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..3a9a20bc3790970c5206eb7f5fa1714dc043ec1d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/logging.jsonl b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51d5662496275d610d8547d81c8ff962760b23d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/logging.jsonl @@ -0,0 +1,33 @@ +{"loss": 1.95458984, "grad_norm": 1.20454865, "learning_rate": 1.667e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.027709, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.12695312, "logits/chosen": -0.23828125, "nll_loss": 1.59375, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "32s", "remaining_time": "1h 3m 43s"} +{"loss": 2.45324707, "grad_norm": 1.34160351, "learning_rate": 8.333e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.041711, "rewards/chosen": 0.00939941, "rewards/rejected": 0.0062561, "rewards/accuracies": 0.25, "rewards/margins": 0.00311279, "logps/rejected": -708.0, "logps/chosen": -712.0, "logits/rejected": 0.08447266, "logits/chosen": -0.0859375, "nll_loss": 1.515625, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "1m 55s", "remaining_time": "44m 26s"} +{"loss": 1.98896484, "grad_norm": 1.21873088, "learning_rate": 9.97e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044665, "rewards/chosen": 0.63671875, "rewards/rejected": 0.57421875, "rewards/accuracies": 0.5, "rewards/margins": 0.06152344, 
"logps/rejected": -592.0, "logps/chosen": -632.0, "logits/rejected": 0.01153564, "logits/chosen": 0.12988281, "nll_loss": 1.109375, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "3m 39s", "remaining_time": "40m 19s"} +{"loss": 2.0972168, "grad_norm": 0.73766677, "learning_rate": 9.847e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04573, "rewards/chosen": 2.609375, "rewards/rejected": 2.234375, "rewards/accuracies": 0.67500001, "rewards/margins": 0.38476562, "logps/rejected": -652.0, "logps/chosen": -652.0, "logits/rejected": -0.02685547, "logits/chosen": -0.15722656, "nll_loss": 1.484375, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "5m 24s", "remaining_time": "37m 48s"} +{"loss": 1.66049805, "grad_norm": 0.74143572, "learning_rate": 9.632e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046282, "rewards/chosen": 5.125, "rewards/rejected": 4.25, "rewards/accuracies": 0.77499998, "rewards/margins": 0.86328125, "logps/rejected": -580.0, "logps/chosen": -424.0, "logits/rejected": 0.00448608, "logits/chosen": -0.08837891, "nll_loss": 1.2109375, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 8s", "remaining_time": "35m 40s"} +{"eval_loss": 1.13671875, "eval_runtime": 2.3676, "eval_samples_per_second": 1.689, "eval_steps_per_second": 0.845, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": 5.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 1.203125, "eval_logps/rejected": -338.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.046875, "eval_logits/chosen": 0.22070312, "eval_nll_loss": 1.296875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "7m 10s", "remaining_time": "35m 52s"} +{"loss": 1.21450195, "grad_norm": 0.53417138, "learning_rate": 9.33e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045353, "rewards/chosen": 7.21875, 
"rewards/rejected": 5.125, "rewards/accuracies": 0.95454544, "rewards/margins": 2.109375, "logps/rejected": -568.0, "logps/chosen": -556.0, "logits/rejected": 0.12304688, "logits/chosen": 0.09521484, "nll_loss": 0.94921875, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "9m 7s", "remaining_time": "34m 39s"} +{"loss": 0.91485596, "grad_norm": 0.24559828, "learning_rate": 8.946e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045808, "rewards/chosen": 8.875, "rewards/rejected": 4.3125, "rewards/accuracies": 0.97500002, "rewards/margins": 4.5625, "logps/rejected": -600.0, "logps/chosen": -540.0, "logits/rejected": -0.1015625, "logits/chosen": 0.09423828, "nll_loss": 0.890625, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "10m 50s", "remaining_time": "32m 32s"} +{"loss": 0.86679688, "grad_norm": 0.15530072, "learning_rate": 8.486e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046016, "rewards/chosen": 9.75, "rewards/rejected": 4.75, "rewards/accuracies": 1.0, "rewards/margins": 4.96875, "logps/rejected": -484.0, "logps/chosen": -572.0, "logits/rejected": -0.08935547, "logits/chosen": -0.09130859, "nll_loss": 0.84765625, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "12m 36s", "remaining_time": "30m 37s"} +{"loss": 0.78258057, "grad_norm": 0.07919289, "learning_rate": 7.961e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046217, "rewards/chosen": 10.0, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 5.78125, "logps/rejected": -628.0, "logps/chosen": -564.0, "logits/rejected": 0.07617188, "logits/chosen": 0.32421875, "nll_loss": 0.81640625, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 21s", "remaining_time": "28m 43s"} +{"eval_loss": 0.66748047, "eval_runtime": 2.6875, "eval_samples_per_second": 1.488, "eval_steps_per_second": 0.744, 
"eval_rewards/chosen": 10.125, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.65625, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1192.0, "eval_logits/rejected": -1.078125, "eval_logits/chosen": 0.15234375, "eval_nll_loss": 0.84375, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "14m 24s", "remaining_time": "28m 48s"} +{"loss": 0.80941162, "grad_norm": 0.10127792, "learning_rate": 7.38e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046248, "rewards/chosen": 10.1875, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 5.8125, "logps/rejected": -668.0, "logps/chosen": -426.0, "logits/rejected": 0.10693359, "logits/chosen": -0.04785156, "nll_loss": 0.8359375, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "16m 9s", "remaining_time": "26m 55s"} +{"loss": 0.85374756, "grad_norm": 0.12223883, "learning_rate": 6.753e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045842, "rewards/chosen": 10.5, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 6.8125, "logps/rejected": -556.0, "logps/chosen": -444.0, "logits/rejected": 0.11132812, "logits/chosen": 0.09765625, "nll_loss": 0.75390625, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "18m 6s", "remaining_time": "25m 21s"} +{"loss": 0.76914063, "grad_norm": 0.06960455, "learning_rate": 6.093e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045954, "rewards/chosen": 11.0, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 6.96875, "logps/rejected": -480.0, "logps/chosen": -468.0, "logits/rejected": -0.06884766, "logits/chosen": 0.17089844, "nll_loss": 0.76171875, "epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "19m 52s", "remaining_time": "23m 29s"} +{"loss": 0.79381104, "grad_norm": 0.11142614, 
"learning_rate": 5.413e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046182, "rewards/chosen": 11.375, "rewards/rejected": 4.0, "rewards/accuracies": 1.0, "rewards/margins": 7.375, "logps/rejected": -492.0, "logps/chosen": -502.0, "logits/rejected": 0.0043335, "logits/chosen": 0.18554688, "nll_loss": 0.7265625, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "21m 35s", "remaining_time": "21m 35s"} +{"eval_loss": 0.64990234, "eval_runtime": 2.3346, "eval_samples_per_second": 1.713, "eval_steps_per_second": 0.857, "eval_rewards/chosen": 11.75, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.25, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0703125, "eval_logits/chosen": 0.2265625, "eval_nll_loss": 0.83203125, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "21m 37s", "remaining_time": "21m 37s"} +{"loss": 0.80182495, "grad_norm": 0.06831208, "learning_rate": 4.725e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046252, "rewards/chosen": 11.4375, "rewards/rejected": 4.46875, "rewards/accuracies": 1.0, "rewards/margins": 7.0, "logps/rejected": -664.0, "logps/chosen": -560.0, "logits/rejected": -0.07470703, "logits/chosen": -0.01300049, "nll_loss": 0.7578125, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "23m 21s", "remaining_time": "19m 45s"} +{"loss": 0.69472656, "grad_norm": 0.06952943, "learning_rate": 4.041e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046346, "rewards/chosen": 11.375, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 7.125, "logps/rejected": -632.0, "logps/chosen": -360.0, "logits/rejected": 0.32226562, "logits/chosen": 0.08007812, "nll_loss": 0.66796875, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "25m 6s", "remaining_time": "17m 
56s"} +{"loss": 0.85532837, "grad_norm": 0.07630483, "learning_rate": 3.377e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046132, "rewards/chosen": 11.875, "rewards/rejected": 4.125, "rewards/accuracies": 1.0, "rewards/margins": 7.71875, "logps/rejected": -544.0, "logps/chosen": -536.0, "logits/rejected": -0.04052734, "logits/chosen": 0.09912109, "nll_loss": 0.70703125, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "27m 1s", "remaining_time": "16m 13s"} +{"loss": 0.71518555, "grad_norm": 0.05681073, "learning_rate": 2.742e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046264, "rewards/chosen": 11.75, "rewards/rejected": 4.1875, "rewards/accuracies": 1.0, "rewards/margins": 7.59375, "logps/rejected": -446.0, "logps/chosen": -504.0, "logits/rejected": -0.10302734, "logits/chosen": 0.21484375, "nll_loss": 0.70703125, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "28m 45s", "remaining_time": "14m 22s"} +{"eval_loss": 0.64111328, "eval_runtime": 2.5228, "eval_samples_per_second": 1.586, "eval_steps_per_second": 0.793, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.0390625, "eval_logits/chosen": 0.27539062, "eval_nll_loss": 0.8203125, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "28m 47s", "remaining_time": "14m 23s"} +{"loss": 0.75514526, "grad_norm": 0.06610579, "learning_rate": 2.151e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046287, "rewards/chosen": 12.25, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -608.0, "logps/chosen": -608.0, "logits/rejected": 0.09667969, "logits/chosen": 0.19238281, "nll_loss": 0.80859375, "epoch": 3.52525253, "global_step/max_steps": "85/120", 
"percentage": "70.83%", "elapsed_time": "30m 32s", "remaining_time": "12m 34s"} +{"loss": 0.67758179, "grad_norm": 0.06714498, "learning_rate": 1.614e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046375, "rewards/chosen": 11.6875, "rewards/rejected": 3.8125, "rewards/accuracies": 1.0, "rewards/margins": 7.90625, "logps/rejected": -576.0, "logps/chosen": -434.0, "logits/rejected": 0.09423828, "logits/chosen": 0.10107422, "nll_loss": 0.6484375, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "32m 16s", "remaining_time": "10m 45s"} +{"loss": 0.74762573, "grad_norm": 0.07216334, "learning_rate": 1.14e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046466, "rewards/chosen": 11.3125, "rewards/rejected": 3.515625, "rewards/accuracies": 1.0, "rewards/margins": 7.75, "logps/rejected": -632.0, "logps/chosen": -368.0, "logits/rejected": 0.1484375, "logits/chosen": 0.05981445, "nll_loss": 0.7109375, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "34m 0s", "remaining_time": "8m 56s"} +{"loss": 0.82763672, "grad_norm": 0.06939643, "learning_rate": 7.4e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04631, "rewards/chosen": 11.5625, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -608.0, "logps/chosen": -364.0, "logits/rejected": 0.22363281, "logits/chosen": 9e-06, "nll_loss": 0.65625, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "35m 55s", "remaining_time": "7m 11s"} +{"eval_loss": 0.63623047, "eval_runtime": 2.7031, "eval_samples_per_second": 1.48, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.28710938, "eval_nll_loss": 0.8125, "epoch": 
4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "35m 58s", "remaining_time": "7m 11s"} +{"loss": 0.7177124, "grad_norm": 0.0818887, "learning_rate": 4.21e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.0463, "rewards/chosen": 12.25, "rewards/rejected": 4.46875, "rewards/accuracies": 1.0, "rewards/margins": 7.75, "logps/rejected": -540.0, "logps/chosen": -422.0, "logits/rejected": 0.0612793, "logits/chosen": 0.06640625, "nll_loss": 0.67578125, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "37m 43s", "remaining_time": "5m 23s"} +{"loss": 0.81170654, "grad_norm": 0.06339462, "learning_rate": 1.89e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046346, "rewards/chosen": 12.5625, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 8.5, "logps/rejected": -860.0, "logps/chosen": -712.0, "logits/rejected": 0.13867188, "logits/chosen": 0.140625, "nll_loss": 0.875, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "39m 29s", "remaining_time": "3m 35s"} +{"loss": 0.81134033, "grad_norm": 0.12700914, "learning_rate": 4.7e-07, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04641, "rewards/chosen": 12.125, "rewards/rejected": 3.96875, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -580.0, "logps/chosen": -588.0, "logits/rejected": -0.10791016, "logits/chosen": 0.16699219, "nll_loss": 0.8515625, "epoch": 4.76767677, "global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "41m 13s", "remaining_time": "1m 47s"} +{"loss": 0.65350342, "grad_norm": 0.07997734, "learning_rate": 0.0, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046432, "rewards/chosen": 12.6875, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -704.0, "logps/chosen": -572.0, "logits/rejected": 0.10791016, "logits/chosen": 0.22558594, "nll_loss": 0.70703125, 
"epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 0s", "remaining_time": "0s"} +{"eval_loss": 0.63476562, "eval_runtime": 2.2615, "eval_samples_per_second": 1.769, "eval_steps_per_second": 0.884, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.1875, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.29296875, "eval_nll_loss": 0.81640625, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 2s", "remaining_time": "0s"} +{"train_runtime": 2583.6181, "train_samples_per_second": 0.766, "train_steps_per_second": 0.046, "total_flos": 11174281019392.0, "train_loss": 1.00726496, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "43m 3s", "remaining_time": "0s"} +{"train_dataset": "1175.542929±552.835821, min=300.000000, max=6095.000000, size=396", "val_dataset": "1179.000000±512.550973, min=698.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1786.3204M Params (9.2324M Trainable [0.5168%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/checkpoint-120", "best_metric": 0.63476562, "global_step": 120, "log_history": [{"loss": 1.95458984375, "grad_norm": 1.2045486456035128, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.027709, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -1064.0, "logps/chosen": -552.0, "logits/rejected": -0.126953125, "logits/chosen": -0.23828125, 
"nll_loss": 1.59375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 2.4532470703125, "grad_norm": 1.3416035124505692, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.041711, "rewards/chosen": 0.0093994140625, "rewards/rejected": 0.006256103515625, "rewards/accuracies": 0.25, "rewards/margins": 0.00311279296875, "logps/rejected": -708.0, "logps/chosen": -712.0, "logits/rejected": 0.08447265625, "logits/chosen": -0.0859375, "nll_loss": 1.515625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.98896484375, "grad_norm": 1.2187308832330528, "learning_rate": 9.969653386589748e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.044665, "rewards/chosen": 0.63671875, "rewards/rejected": 0.57421875, "rewards/accuracies": 0.5, "rewards/margins": 0.0615234375, "logps/rejected": -592.0, "logps/chosen": -632.0, "logits/rejected": 0.01153564453125, "logits/chosen": 0.1298828125, "nll_loss": 1.109375, "epoch": 0.40404040404040403, "step": 10}, {"loss": 2.097216796875, "grad_norm": 0.7376667652661941, "learning_rate": 9.847001329696653e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04573, "rewards/chosen": 2.609375, "rewards/rejected": 2.234375, "rewards/accuracies": 0.675000011920929, "rewards/margins": 0.384765625, "logps/rejected": -652.0, "logps/chosen": -652.0, "logits/rejected": -0.02685546875, "logits/chosen": -0.1572265625, "nll_loss": 1.484375, "epoch": 0.6060606060606061, "step": 15}, {"loss": 1.660498046875, "grad_norm": 0.7414357248505932, "learning_rate": 9.632470336074009e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046282, "rewards/chosen": 5.125, "rewards/rejected": 4.25, "rewards/accuracies": 0.7749999761581421, "rewards/margins": 0.86328125, "logps/rejected": -580.0, "logps/chosen": -424.0, "logits/rejected": 0.004486083984375, "logits/chosen": -0.08837890625, "nll_loss": 1.2109375, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 1.13671875, "eval_runtime": 2.3676, "eval_samples_per_second": 
1.689, "eval_steps_per_second": 0.845, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": 5.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 1.203125, "eval_logps/rejected": -338.0, "eval_logps/chosen": -1224.0, "eval_logits/rejected": -1.046875, "eval_logits/chosen": 0.220703125, "eval_nll_loss": 1.296875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 1.214501953125, "grad_norm": 0.5341713832460793, "learning_rate": 9.330127018922194e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045353, "rewards/chosen": 7.21875, "rewards/rejected": 5.125, "rewards/accuracies": 0.9545454382896423, "rewards/margins": 2.109375, "logps/rejected": -568.0, "logps/chosen": -556.0, "logits/rejected": 0.123046875, "logits/chosen": 0.09521484375, "nll_loss": 0.94921875, "epoch": 1.0404040404040404, "step": 25}, {"loss": 0.91485595703125, "grad_norm": 0.24559828292137062, "learning_rate": 8.945702546981969e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045808, "rewards/chosen": 8.875, "rewards/rejected": 4.3125, "rewards/accuracies": 0.9750000238418579, "rewards/margins": 4.5625, "logps/rejected": -600.0, "logps/chosen": -540.0, "logits/rejected": -0.1015625, "logits/chosen": 0.09423828125, "nll_loss": 0.890625, "epoch": 1.2424242424242424, "step": 30}, {"loss": 0.866796875, "grad_norm": 0.1553007238676429, "learning_rate": 8.486484005469977e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046016, "rewards/chosen": 9.75, "rewards/rejected": 4.75, "rewards/accuracies": 1.0, "rewards/margins": 4.96875, "logps/rejected": -484.0, "logps/chosen": -572.0, "logits/rejected": -0.08935546875, "logits/chosen": -0.09130859375, "nll_loss": 0.84765625, "epoch": 1.4444444444444444, "step": 35}, {"loss": 0.78258056640625, "grad_norm": 0.0791928852302197, "learning_rate": 7.961176263324901e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046217, "rewards/chosen": 10.0, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 5.78125, "logps/rejected": 
-628.0, "logps/chosen": -564.0, "logits/rejected": 0.076171875, "logits/chosen": 0.32421875, "nll_loss": 0.81640625, "epoch": 1.6464646464646466, "step": 40}, {"eval_loss": 0.66748046875, "eval_runtime": 2.6875, "eval_samples_per_second": 1.488, "eval_steps_per_second": 0.744, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.65625, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1192.0, "eval_logits/rejected": -1.078125, "eval_logits/chosen": 0.15234375, "eval_nll_loss": 0.84375, "epoch": 1.6464646464646466, "step": 40}, {"loss": 0.80941162109375, "grad_norm": 0.10127792485834027, "learning_rate": 7.379736965185368e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046248, "rewards/chosen": 10.1875, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 5.8125, "logps/rejected": -668.0, "logps/chosen": -426.0, "logits/rejected": 0.10693359375, "logits/chosen": -0.0478515625, "nll_loss": 0.8359375, "epoch": 1.8484848484848486, "step": 45}, {"loss": 0.85374755859375, "grad_norm": 0.12223882923443313, "learning_rate": 6.753187775963773e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045842, "rewards/chosen": 10.5, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 6.8125, "logps/rejected": -556.0, "logps/chosen": -444.0, "logits/rejected": 0.111328125, "logits/chosen": 0.09765625, "nll_loss": 0.75390625, "epoch": 2.080808080808081, "step": 50}, {"loss": 0.769140625, "grad_norm": 0.06960454571278732, "learning_rate": 6.09340545603188e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.045954, "rewards/chosen": 11.0, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 6.96875, "logps/rejected": -480.0, "logps/chosen": -468.0, "logits/rejected": -0.06884765625, "logits/chosen": 0.1708984375, "nll_loss": 0.76171875, "epoch": 2.282828282828283, "step": 55}, {"loss": 0.79381103515625, "grad_norm": 0.11142613870327374, 
"learning_rate": 5.4128967273616625e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046182, "rewards/chosen": 11.375, "rewards/rejected": 4.0, "rewards/accuracies": 1.0, "rewards/margins": 7.375, "logps/rejected": -492.0, "logps/chosen": -502.0, "logits/rejected": 0.00433349609375, "logits/chosen": 0.185546875, "nll_loss": 0.7265625, "epoch": 2.484848484848485, "step": 60}, {"eval_loss": 0.64990234375, "eval_runtime": 2.3346, "eval_samples_per_second": 1.713, "eval_steps_per_second": 0.857, "eval_rewards/chosen": 11.75, "eval_rewards/rejected": 4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.25, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1176.0, "eval_logits/rejected": -1.0703125, "eval_logits/chosen": 0.2265625, "eval_nll_loss": 0.83203125, "epoch": 2.484848484848485, "step": 60}, {"loss": 0.801824951171875, "grad_norm": 0.06831207602075139, "learning_rate": 4.7245611982206724e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046252, "rewards/chosen": 11.4375, "rewards/rejected": 4.46875, "rewards/accuracies": 1.0, "rewards/margins": 7.0, "logps/rejected": -664.0, "logps/chosen": -560.0, "logits/rejected": -0.07470703125, "logits/chosen": -0.01300048828125, "nll_loss": 0.7578125, "epoch": 2.686868686868687, "step": 65}, {"loss": 0.6947265625, "grad_norm": 0.0695294288435998, "learning_rate": 4.0414468403813095e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046346, "rewards/chosen": 11.375, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 7.125, "logps/rejected": -632.0, "logps/chosen": -360.0, "logits/rejected": 0.322265625, "logits/chosen": 0.080078125, "nll_loss": 0.66796875, "epoch": 2.888888888888889, "step": 70}, {"loss": 0.855328369140625, "grad_norm": 0.07630483079111013, "learning_rate": 3.3765026539765834e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046132, "rewards/chosen": 11.875, "rewards/rejected": 4.125, "rewards/accuracies": 1.0, "rewards/margins": 7.71875, "logps/rejected": 
-544.0, "logps/chosen": -536.0, "logits/rejected": -0.04052734375, "logits/chosen": 0.09912109375, "nll_loss": 0.70703125, "epoch": 3.121212121212121, "step": 75}, {"loss": 0.715185546875, "grad_norm": 0.056810731843726946, "learning_rate": 2.7423332084455544e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046264, "rewards/chosen": 11.75, "rewards/rejected": 4.1875, "rewards/accuracies": 1.0, "rewards/margins": 7.59375, "logps/rejected": -446.0, "logps/chosen": -504.0, "logits/rejected": -0.10302734375, "logits/chosen": 0.21484375, "nll_loss": 0.70703125, "epoch": 3.323232323232323, "step": 80}, {"eval_loss": 0.64111328125, "eval_runtime": 2.5228, "eval_samples_per_second": 1.586, "eval_steps_per_second": 0.793, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.0390625, "eval_logits/chosen": 0.275390625, "eval_nll_loss": 0.8203125, "epoch": 3.323232323232323, "step": 80}, {"loss": 0.755145263671875, "grad_norm": 0.0661057850452002, "learning_rate": 2.150959712448669e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046287, "rewards/chosen": 12.25, "rewards/rejected": 4.09375, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -608.0, "logps/chosen": -608.0, "logits/rejected": 0.0966796875, "logits/chosen": 0.1923828125, "nll_loss": 0.80859375, "epoch": 3.525252525252525, "step": 85}, {"loss": 0.677581787109375, "grad_norm": 0.06714498393054905, "learning_rate": 1.6135921418712956e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046375, "rewards/chosen": 11.6875, "rewards/rejected": 3.8125, "rewards/accuracies": 1.0, "rewards/margins": 7.90625, "logps/rejected": -576.0, "logps/chosen": -434.0, "logits/rejected": 0.09423828125, "logits/chosen": 0.10107421875, "nll_loss": 0.6484375, "epoch": 3.7272727272727275, "step": 90}, {"loss": 0.747625732421875, "grad_norm": 
0.07216333845405633, "learning_rate": 1.1404167454183957e-05, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046466, "rewards/chosen": 11.3125, "rewards/rejected": 3.515625, "rewards/accuracies": 1.0, "rewards/margins": 7.75, "logps/rejected": -632.0, "logps/chosen": -368.0, "logits/rejected": 0.1484375, "logits/chosen": 0.059814453125, "nll_loss": 0.7109375, "epoch": 3.929292929292929, "step": 95}, {"loss": 0.82763671875, "grad_norm": 0.0693964252521182, "learning_rate": 7.404029558083653e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04631, "rewards/chosen": 11.5625, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -608.0, "logps/chosen": -364.0, "logits/rejected": 0.2236328125, "logits/chosen": 9.000301361083984e-06, "nll_loss": 0.65625, "epoch": 4.161616161616162, "step": 100}, {"eval_loss": 0.63623046875, "eval_runtime": 2.7031, "eval_samples_per_second": 1.48, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.125, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.287109375, "eval_nll_loss": 0.8125, "epoch": 4.161616161616162, "step": 100}, {"loss": 0.71771240234375, "grad_norm": 0.08188869975605054, "learning_rate": 4.2113336672471245e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.0463, "rewards/chosen": 12.25, "rewards/rejected": 4.46875, "rewards/accuracies": 1.0, "rewards/margins": 7.75, "logps/rejected": -540.0, "logps/chosen": -422.0, "logits/rejected": 0.061279296875, "logits/chosen": 0.06640625, "nll_loss": 0.67578125, "epoch": 4.363636363636363, "step": 105}, {"loss": 0.81170654296875, "grad_norm": 0.06339462001786819, "learning_rate": 1.8865999845374793e-06, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046346, "rewards/chosen": 12.5625, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 8.5, 
"logps/rejected": -860.0, "logps/chosen": -712.0, "logits/rejected": 0.138671875, "logits/chosen": 0.140625, "nll_loss": 0.875, "epoch": 4.565656565656566, "step": 110}, {"loss": 0.81134033203125, "grad_norm": 0.12700913709178566, "learning_rate": 4.738957681248379e-07, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.04641, "rewards/chosen": 12.125, "rewards/rejected": 3.96875, "rewards/accuracies": 1.0, "rewards/margins": 8.1875, "logps/rejected": -580.0, "logps/chosen": -588.0, "logits/rejected": -0.10791015625, "logits/chosen": 0.1669921875, "nll_loss": 0.8515625, "epoch": 4.767676767676767, "step": 115}, {"loss": 0.65350341796875, "grad_norm": 0.07997734087252309, "learning_rate": 0.0, "memory(GiB)": 25.88, "train_speed(iter/s)": 0.046432, "rewards/chosen": 12.6875, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 8.9375, "logps/rejected": -704.0, "logps/chosen": -572.0, "logits/rejected": 0.10791015625, "logits/chosen": 0.2255859375, "nll_loss": 0.70703125, "epoch": 4.96969696969697, "step": 120}, {"eval_loss": 0.634765625, "eval_runtime": 2.2615, "eval_samples_per_second": 1.769, "eval_steps_per_second": 0.884, "eval_rewards/chosen": 12.625, "eval_rewards/rejected": 4.375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.1875, "eval_logps/rejected": -348.0, "eval_logps/chosen": -1168.0, "eval_logits/rejected": -1.03125, "eval_logits/chosen": 0.29296875, "eval_nll_loss": 0.81640625, "epoch": 4.96969696969697, "step": 120}, {"train_runtime": 2583.6181, "train_samples_per_second": 0.766, "train_steps_per_second": 0.046, "total_flos": 11174281019392.0, "train_loss": 1.0072649637858073, "epoch": 4.96969696969697, "step": 120}], "memory": 25.8828125} diff --git a/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs/events.out.tfevents.1737747472.kml-dtmachine-18088-prod.48977.0 
b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs/events.out.tfevents.1737747472.kml-dtmachine-18088-prod.48977.0 new file mode 100644 index 0000000000000000000000000000000000000000..6fa1d9ecfb7d712c7d0fe2b081803599749f0e45 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-193716/runs/events.out.tfevents.1737747472.kml-dtmachine-18088-prod.48977.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67843880198cbb2d37ca53f700593bbe2e8c41fe80e4529a75d8384eef465c2 +size 33762 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ 
+ null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + 
"logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + 
"gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + 
"trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + 
"lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, 
requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, 
restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 
'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, 
acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. 
+ +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53c461d5f3a85edc90323cd3440b1f3226e00015 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb269636f75387fb4f207fc2f74386393b9641690a98421baaa0846e9b5cd554 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d761f897c595b558d1ad7bb33414dc847e90362 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c68cd1ba8ee955d7e3d3a5649101ceab28dd06a81da375959a263d25ff15b25 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff58ad95369e9de9ed61dfa2d66090bb28448788 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c567688ead8a7c31794dd782717b302054f77a5b4ed9dd7ecac79677f7f29e +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a43cd493bfff38acaa692b61da877d08808c8d03 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e06976ef4c3c32acc07db90f2553b5c853042b8ca1a43ed089e4b947ac8238 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9565d4714cc1d67720e7184ba2c456961eb94fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74075f57211a8dfa6b22041d44272db794a95351cb4b97b06c931f4957df6572 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 
14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cd23a72b89802d26bb56d323b59455e93f4e647e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.453125, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.0804463784398531, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.578125, + "logps/chosen": -376.0, + "logps/rejected": -458.0, + "loss": 0.65897216796875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 6.84375, + "rewards/rejected": 5.21875, + "step": 25, + "train_speed(iter/s)": 0.025148 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.07518236632054026, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.6484375, + "logps/chosen": -370.0, + "logps/rejected": -470.0, + "loss": 0.56932373046875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 13.9375, + "rewards/margins": 9.875, + "rewards/rejected": 4.03125, + "step": 30, + "train_speed(iter/s)": 0.025393 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06767411898397745, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5546875, + "logps/chosen": -394.0, + "logps/rejected": -394.0, + "loss": 0.557476806640625, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.375, + "rewards/rejected": 4.375, + "step": 35, + "train_speed(iter/s)": 0.025333 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.047636534461983134, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.671875, + "logps/chosen": -368.0, + "logps/rejected": -478.0, + "loss": 0.5288330078125, + "memory(GiB)": 54.89, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 10.3125, + "rewards/rejected": 4.375, + "step": 40, + "train_speed(iter/s)": 0.025528 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6953125, + "eval_logits/rejected": -1.4375, + "eval_logps/chosen": -784.0, + "eval_logps/rejected": -236.0, + "eval_loss": 0.46875, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.0, + "eval_rewards/margins": 12.25, + "eval_rewards/rejected": 4.8125, + "eval_runtime": 4.1663, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.053771621820461274, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.6171875, + "logits/rejected": -1.7109375, + "logps/chosen": -292.0, + "logps/rejected": -540.0, + "loss": 0.5459747314453125, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.9375, + "rewards/rejected": 4.9375, + "step": 45, 
+ "train_speed(iter/s)": 0.025633 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.0556754575357442, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -298.0, + "logps/rejected": -424.0, + "loss": 0.555657958984375, + "memory(GiB)": 54.89, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 10.1875, + "rewards/rejected": 4.78125, + "step": 50, + "train_speed(iter/s)": 0.025504 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03050128350589889, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.546875, + "logps/chosen": -340.0, + "logps/rejected": -392.0, + "loss": 0.5152923583984375, + "memory(GiB)": 54.89, + "nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 10.875, + "rewards/rejected": 4.59375, + "step": 55, + "train_speed(iter/s)": 0.025525 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.03973909846789092, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.53125, + "logps/chosen": -356.0, + "logps/rejected": -384.0, + "loss": 0.51326904296875, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.625, + "rewards/rejected": 4.875, + "step": 60, + "train_speed(iter/s)": 0.025583 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3671875, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": -237.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.625, + "eval_rewards/rejected": 4.65625, + "eval_runtime": 4.123, + "eval_samples_per_second": 0.97, + "eval_steps_per_second": 0.485, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06262824842411734, + 
"learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6328125, + "logits/rejected": -1.6171875, + "logps/chosen": -354.0, + "logps/rejected": -504.0, + "loss": 0.52603759765625, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.8125, + "rewards/margins": 10.875, + "rewards/rejected": 4.875, + "step": 65, + "train_speed(iter/s)": 0.025605 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0481159192704273, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.5, + "logits/rejected": -1.6484375, + "logps/chosen": -251.0, + "logps/rejected": -544.0, + "loss": 0.48192138671875, + "memory(GiB)": 56.74, + "nll_loss": 0.474609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.6875, + "rewards/rejected": 4.0625, + "step": 70, + "train_speed(iter/s)": 0.025704 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.04995228118845036, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.4609375, + "logps/chosen": -364.0, + "logps/rejected": -420.0, + "loss": 0.5461639404296875, + "memory(GiB)": 56.74, + "nll_loss": 0.455078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.0, + "rewards/margins": 11.9375, + "rewards/rejected": 5.09375, + "step": 75, + "train_speed(iter/s)": 0.025622 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04248238771472134, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.515625, + "logps/chosen": -344.0, + "logps/rejected": -364.0, + "loss": 0.46167449951171874, + "memory(GiB)": 56.74, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.9375, + "rewards/rejected": 4.59375, + "step": 80, + "train_speed(iter/s)": 0.025696 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.3359375, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": 
-239.0, + "eval_loss": 0.455078125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.875, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 4.274, + "eval_samples_per_second": 0.936, + "eval_steps_per_second": 0.468, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.053616530504593876, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -1.65625, + "logits/rejected": -1.578125, + "logps/chosen": -396.0, + "logps/rejected": -488.0, + "loss": 0.4924560546875, + "memory(GiB)": 56.74, + "nll_loss": 0.5234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 13.125, + "rewards/rejected": 4.21875, + "step": 85, + "train_speed(iter/s)": 0.025636 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.04902185513812503, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.5234375, + "logps/chosen": -298.0, + "logps/rejected": -478.0, + "loss": 0.4585693359375, + "memory(GiB)": 56.74, + "nll_loss": 0.455078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 12.5, + "rewards/rejected": 3.8125, + "step": 90, + "train_speed(iter/s)": 0.025692 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.051170579618764526, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -1.65625, + "logits/rejected": -1.625, + "logps/chosen": -253.0, + "logps/rejected": -502.0, + "loss": 0.504632568359375, + "memory(GiB)": 56.74, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.3125, + "rewards/rejected": 4.25, + "step": 95, + "train_speed(iter/s)": 0.025745 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.05141718150059864, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -1.5859375, + "logits/rejected": -1.6953125, + "logps/chosen": -247.0, + "logps/rejected": -498.0, + "loss": 0.530096435546875, + 
"memory(GiB)": 56.74, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 11.0625, + "rewards/rejected": 3.9375, + "step": 100, + "train_speed(iter/s)": 0.025709 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -1.5859375, + "eval_logits/rejected": -1.328125, + "eval_logps/chosen": -768.0, + "eval_logps/rejected": -240.0, + "eval_loss": 0.453125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.625, + "eval_rewards/margins": 14.25, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 3.8342, + "eval_samples_per_second": 1.043, + "eval_steps_per_second": 0.522, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 54196415234048.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/zero_to_fp32.py 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..74f3bfc3b14ca5e78d520e8546a7e1a72fa3639c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a2f499f3e92b27d8ba4e762026d1d5a12648c1a155150eea1650d609975ea7 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cef84fe3f777da61548aebb9ddc0ee5cd6c15d0c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349b43a4c9b1c8b37720078e372f9db6a50b5c963b9094dd4450b37f647a2767 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad697d91034b50380392280b39319d1e9a1908f6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ccb90c21900941b268980002c87ca8648269fd8ebdedde4545f7c14a39eeb5 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07e49e6a39e634ef967b0cdb08d1abd68382ab16 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f261bc165e44a0fc58fc535ebc035e51f6ecdd266757157105df194e4a4cdde +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f8a60f4f806b79bd8d88f6131442596235d203 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf51e090971937103ddd1c3f612aaa557cbf4ecbdcf1e2d1c0012c06900b9169 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..1ff406405418d84068458850f74aecfc6224f793 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/latest @@ -0,0 +1 @@ +global_step122 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a49f44ba05d98a84fd55c18c4fa41c6437c8853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff +size 
14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..79ef7e8924723bd699efa313eb78103d80b7edb9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40007a79aad967206b797079ca5147beff46ee1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede1043a0735266b510faa06f578fa6ef180c11e994a142a88a13ac6f33eb78b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcd131fa6ff7638c96434bdf2eb4e34174939f3e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.45166016, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120", + "epoch": 4.96969696969697, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.0804463784398531, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.578125, + "logps/chosen": -376.0, + "logps/rejected": -458.0, + "loss": 0.65897216796875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 6.84375, + "rewards/rejected": 5.21875, + "step": 25, + "train_speed(iter/s)": 0.025148 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.07518236632054026, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.6484375, + "logps/chosen": -370.0, + "logps/rejected": -470.0, + "loss": 0.56932373046875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 13.9375, + "rewards/margins": 9.875, + "rewards/rejected": 4.03125, + "step": 30, + "train_speed(iter/s)": 0.025393 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06767411898397745, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5546875, + "logps/chosen": -394.0, + "logps/rejected": -394.0, + "loss": 0.557476806640625, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.375, + "rewards/rejected": 4.375, + "step": 35, + "train_speed(iter/s)": 0.025333 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.047636534461983134, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.671875, + "logps/chosen": -368.0, + "logps/rejected": -478.0, + "loss": 0.5288330078125, + "memory(GiB)": 54.89, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 10.3125, + "rewards/rejected": 4.375, + "step": 40, + "train_speed(iter/s)": 0.025528 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6953125, + "eval_logits/rejected": -1.4375, + "eval_logps/chosen": -784.0, + "eval_logps/rejected": -236.0, + "eval_loss": 0.46875, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.0, + "eval_rewards/margins": 12.25, + "eval_rewards/rejected": 4.8125, + "eval_runtime": 4.1663, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.053771621820461274, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.6171875, + "logits/rejected": -1.7109375, + "logps/chosen": -292.0, + "logps/rejected": -540.0, + "loss": 0.5459747314453125, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.9375, + "rewards/rejected": 4.9375, + "step": 45, 
+ "train_speed(iter/s)": 0.025633 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.0556754575357442, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -298.0, + "logps/rejected": -424.0, + "loss": 0.555657958984375, + "memory(GiB)": 54.89, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 10.1875, + "rewards/rejected": 4.78125, + "step": 50, + "train_speed(iter/s)": 0.025504 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03050128350589889, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.546875, + "logps/chosen": -340.0, + "logps/rejected": -392.0, + "loss": 0.5152923583984375, + "memory(GiB)": 54.89, + "nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 10.875, + "rewards/rejected": 4.59375, + "step": 55, + "train_speed(iter/s)": 0.025525 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.03973909846789092, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.53125, + "logps/chosen": -356.0, + "logps/rejected": -384.0, + "loss": 0.51326904296875, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.625, + "rewards/rejected": 4.875, + "step": 60, + "train_speed(iter/s)": 0.025583 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3671875, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": -237.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.625, + "eval_rewards/rejected": 4.65625, + "eval_runtime": 4.123, + "eval_samples_per_second": 0.97, + "eval_steps_per_second": 0.485, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06262824842411734, + 
"learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6328125, + "logits/rejected": -1.6171875, + "logps/chosen": -354.0, + "logps/rejected": -504.0, + "loss": 0.52603759765625, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.8125, + "rewards/margins": 10.875, + "rewards/rejected": 4.875, + "step": 65, + "train_speed(iter/s)": 0.025605 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0481159192704273, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.5, + "logits/rejected": -1.6484375, + "logps/chosen": -251.0, + "logps/rejected": -544.0, + "loss": 0.48192138671875, + "memory(GiB)": 56.74, + "nll_loss": 0.474609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.6875, + "rewards/rejected": 4.0625, + "step": 70, + "train_speed(iter/s)": 0.025704 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.04995228118845036, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.4609375, + "logps/chosen": -364.0, + "logps/rejected": -420.0, + "loss": 0.5461639404296875, + "memory(GiB)": 56.74, + "nll_loss": 0.455078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.0, + "rewards/margins": 11.9375, + "rewards/rejected": 5.09375, + "step": 75, + "train_speed(iter/s)": 0.025622 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04248238771472134, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.515625, + "logps/chosen": -344.0, + "logps/rejected": -364.0, + "loss": 0.46167449951171874, + "memory(GiB)": 56.74, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.9375, + "rewards/rejected": 4.59375, + "step": 80, + "train_speed(iter/s)": 0.025696 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.3359375, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": 
-239.0, + "eval_loss": 0.455078125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.875, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 4.274, + "eval_samples_per_second": 0.936, + "eval_steps_per_second": 0.468, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.053616530504593876, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -1.65625, + "logits/rejected": -1.578125, + "logps/chosen": -396.0, + "logps/rejected": -488.0, + "loss": 0.4924560546875, + "memory(GiB)": 56.74, + "nll_loss": 0.5234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 13.125, + "rewards/rejected": 4.21875, + "step": 85, + "train_speed(iter/s)": 0.025636 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.04902185513812503, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.5234375, + "logps/chosen": -298.0, + "logps/rejected": -478.0, + "loss": 0.4585693359375, + "memory(GiB)": 56.74, + "nll_loss": 0.455078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 12.5, + "rewards/rejected": 3.8125, + "step": 90, + "train_speed(iter/s)": 0.025692 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.051170579618764526, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -1.65625, + "logits/rejected": -1.625, + "logps/chosen": -253.0, + "logps/rejected": -502.0, + "loss": 0.504632568359375, + "memory(GiB)": 56.74, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.3125, + "rewards/rejected": 4.25, + "step": 95, + "train_speed(iter/s)": 0.025745 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.05141718150059864, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -1.5859375, + "logits/rejected": -1.6953125, + "logps/chosen": -247.0, + "logps/rejected": -498.0, + "loss": 0.530096435546875, + 
"memory(GiB)": 56.74, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 11.0625, + "rewards/rejected": 3.9375, + "step": 100, + "train_speed(iter/s)": 0.025709 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -1.5859375, + "eval_logits/rejected": -1.328125, + "eval_logps/chosen": -768.0, + "eval_logps/rejected": -240.0, + "eval_loss": 0.453125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.625, + "eval_rewards/margins": 14.25, + "eval_rewards/rejected": 4.40625, + "eval_runtime": 3.8342, + "eval_samples_per_second": 1.043, + "eval_steps_per_second": 0.522, + "step": 100 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.06586560796758323, + "learning_rate": 4.2113336672471245e-06, + "logits/chosen": -1.5703125, + "logits/rejected": -1.5390625, + "logps/chosen": -280.0, + "logps/rejected": -450.0, + "loss": 0.47074127197265625, + "memory(GiB)": 56.74, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 12.1875, + "rewards/rejected": 3.890625, + "step": 105, + "train_speed(iter/s)": 0.025739 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 0.05873220563672292, + "learning_rate": 1.8865999845374793e-06, + "logits/chosen": -1.6640625, + "logits/rejected": -1.578125, + "logps/chosen": -418.0, + "logps/rejected": -596.0, + "loss": 0.5236541748046875, + "memory(GiB)": 56.74, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 12.25, + "rewards/rejected": 4.96875, + "step": 110, + "train_speed(iter/s)": 0.025778 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.10912918908815608, + "learning_rate": 4.738957681248379e-07, + "logits/chosen": -1.6484375, + "logits/rejected": -1.515625, + "logps/chosen": -374.0, + "logps/rejected": -440.0, + "loss": 0.5196258544921875, + "memory(GiB)": 56.74, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 17.375, + "rewards/margins": 12.75, + "rewards/rejected": 4.59375, + "step": 115, + "train_speed(iter/s)": 0.025827 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.06404241447513843, + "learning_rate": 0.0, + "logits/chosen": -1.546875, + "logits/rejected": -1.453125, + "logps/chosen": -386.0, + "logps/rejected": -536.0, + "loss": 0.44427337646484377, + "memory(GiB)": 56.74, + "nll_loss": 0.47265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.625, + "rewards/margins": 12.4375, + "rewards/rejected": 5.15625, + "step": 120, + "train_speed(iter/s)": 0.025823 + }, + { + "epoch": 4.96969696969697, + "eval_logits/chosen": -1.578125, + "eval_logits/rejected": -1.3125, + "eval_logps/chosen": -768.0, + "eval_logps/rejected": -239.0, + "eval_loss": 0.45166015625, + "eval_nll_loss": 0.54296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.75, + "eval_rewards/margins": 14.25, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 3.9949, + "eval_samples_per_second": 1.001, + "eval_steps_per_second": 0.501, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 64868340203520.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9b238fa3c9d2c17c54d04393c7567b0b06821f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c121a797e72148d6bbc7efb56e13109c47ff2bfdaace8345ddc4a4a9597b9f +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4209bc35324206ae0accf92a252677f13a135ae2 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecd6b756c9009b037251d0dd640b9f34a8de306706a0335e71ccc9ebc249fe9d +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ddc1e55e1334897a093007e7ccbca3e5fa1339e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dfe001865b87594c3f61d6f790e2a850bfcec9856c5e1dabd60730d21584e21 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17cea2764a01871f6a1a26b88474fb83432e2342 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd8d20e0daa19f3da74a53ded05c8904aab682db343ed38f6419a9e8b219d47 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13dbdb5c086a404044eed7d3073796d64d71c67d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4351eebfddfbc95fb4f0c31aa99c4669a970f5d1dd67f76652cef5195f669754 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..485a921542e29aa1d8fadf43d6e1e7e8eb41de5f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.52001953, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 11140054777856.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48c791a3835be82b9e28c7ab33f9a022c456c9d1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8bfa818f62ba9bda51c737e6f39031aeec954e82f5c61171b925924a6279ae8 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2933bd29b56055432a086ad20c8ab01efcfcb2f5 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d611777417ebd01a0998880a7ea68514eb4563f9bdfd9423acd4b36eb2d0a0 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81e2ae69966a12fddd46ff5cd2e827e68f291939 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d732b53a0b420e1510844c6316b573ee91f9979d27fd79295c96f816a8fec1b1 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..200e6b2143a228c693b631a71e9f3f8cd613d916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f261a0a7746bfdfbe379d61c4b9a1a3101c15fb35a7727443bb97548aa3453 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f92a5227f433763962cf4fe746c5f3b9fc1e078 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53d8136ed04e678c2604a1b84bd5830e6503d57715c1347a37f71f219591523 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2bfeaf22080c81f519029cf83381148eceb3628a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.46875, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.0804463784398531, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.578125, + "logps/chosen": -376.0, + "logps/rejected": -458.0, + "loss": 0.65897216796875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 6.84375, + "rewards/rejected": 5.21875, + "step": 25, + "train_speed(iter/s)": 0.025148 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.07518236632054026, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.6484375, + "logps/chosen": -370.0, + "logps/rejected": -470.0, + "loss": 0.56932373046875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 13.9375, + "rewards/margins": 9.875, + "rewards/rejected": 4.03125, + "step": 30, + "train_speed(iter/s)": 0.025393 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06767411898397745, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5546875, + "logps/chosen": -394.0, + "logps/rejected": -394.0, + "loss": 0.557476806640625, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.375, + "rewards/rejected": 4.375, + "step": 35, + "train_speed(iter/s)": 0.025333 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.047636534461983134, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.671875, + "logps/chosen": -368.0, + "logps/rejected": -478.0, + "loss": 0.5288330078125, + "memory(GiB)": 54.89, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 10.3125, + "rewards/rejected": 4.375, + "step": 40, + "train_speed(iter/s)": 0.025528 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6953125, + "eval_logits/rejected": -1.4375, + "eval_logps/chosen": -784.0, + "eval_logps/rejected": -236.0, + "eval_loss": 0.46875, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.0, + "eval_rewards/margins": 12.25, + "eval_rewards/rejected": 4.8125, + "eval_runtime": 4.1663, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 22788815323136.0, + "train_batch_size": 1, + "trial_name": null, + 
"trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f1f067a00c58dd907f6ce489f094449473ae413 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c564597f2818ddf22d9a3537cd14a7c7455bd78c2f11fd7aaab7bba044200fe3 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e236d9fc3bc6a3834780bc20b6a35991b42ea565 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e56c5169f08b2d5772de2f8f8b36fb59785f634ad5e7d8ed205782e17b3b32 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc0a055aa939ce9b48043913f2e19f60501199ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ef63ea8040bd68c9043910c0efe390f584737644f8ccd14cda113e019f3cb6 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c33a8f6b932e36445db5646999b5b2be7158d41 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c8fcfc5a7e3a3997ccd385accb83d8a322b4cdd7020d7f9ec9dbf789f4948f +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8afdbe4e7073be1f1d59805bedc3570347b027a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4329013567adcc4dbe71958d8df58ceadb8514e5968a5074267338076d097d36 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e958331ea8e5ad7f6a1409066ff85b48664f4593 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.45703125, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.0804463784398531, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.578125, + "logps/chosen": -376.0, + "logps/rejected": -458.0, + "loss": 0.65897216796875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 6.84375, + "rewards/rejected": 5.21875, + "step": 25, + "train_speed(iter/s)": 0.025148 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.07518236632054026, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.6484375, + "logps/chosen": -370.0, + "logps/rejected": -470.0, + "loss": 0.56932373046875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 13.9375, + "rewards/margins": 9.875, + "rewards/rejected": 4.03125, + "step": 30, + "train_speed(iter/s)": 0.025393 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06767411898397745, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5546875, + "logps/chosen": -394.0, + "logps/rejected": -394.0, + "loss": 0.557476806640625, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.375, + "rewards/rejected": 4.375, + "step": 35, + "train_speed(iter/s)": 0.025333 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.047636534461983134, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.671875, + "logps/chosen": -368.0, + "logps/rejected": -478.0, + "loss": 0.5288330078125, + "memory(GiB)": 54.89, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 10.3125, + "rewards/rejected": 4.375, + "step": 40, + "train_speed(iter/s)": 0.025528 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6953125, + "eval_logits/rejected": -1.4375, + "eval_logps/chosen": -784.0, + "eval_logps/rejected": -236.0, + "eval_loss": 0.46875, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.0, + "eval_rewards/margins": 12.25, + "eval_rewards/rejected": 4.8125, + "eval_runtime": 4.1663, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.053771621820461274, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.6171875, + "logits/rejected": -1.7109375, + "logps/chosen": -292.0, + "logps/rejected": -540.0, + "loss": 0.5459747314453125, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.9375, + "rewards/rejected": 4.9375, + "step": 45, 
+ "train_speed(iter/s)": 0.025633 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.0556754575357442, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -298.0, + "logps/rejected": -424.0, + "loss": 0.555657958984375, + "memory(GiB)": 54.89, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 10.1875, + "rewards/rejected": 4.78125, + "step": 50, + "train_speed(iter/s)": 0.025504 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03050128350589889, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.546875, + "logps/chosen": -340.0, + "logps/rejected": -392.0, + "loss": 0.5152923583984375, + "memory(GiB)": 54.89, + "nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 10.875, + "rewards/rejected": 4.59375, + "step": 55, + "train_speed(iter/s)": 0.025525 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.03973909846789092, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.53125, + "logps/chosen": -356.0, + "logps/rejected": -384.0, + "loss": 0.51326904296875, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.625, + "rewards/rejected": 4.875, + "step": 60, + "train_speed(iter/s)": 0.025583 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3671875, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": -237.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.625, + "eval_rewards/rejected": 4.65625, + "eval_runtime": 4.123, + "eval_samples_per_second": 0.97, + "eval_steps_per_second": 0.485, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 
0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 32612124196864.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df5da74cc546017e052d802896cc50af508ad8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c720e8d1326e3bfc7b13ddcf0f516b795bae8aa5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87ce652aaf209cb9dc3c946291128db8078f63bce080b973c01ec19219a7fc8 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d859212a48668ac5a2befb6fba17d5a7fa639a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": 
true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, 
+ "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1666a818489c34e9f9ee20ff53715f96fab5779a --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc9229aa1f5e561a870d9d3a1ab5d3c15ddceffa85b6f696f02a7ee2bbab6b59 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6574f49c79cd8a594003920b2328068e67866c1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31eed274f6db568340a64b1faa14cc9a0f61c590dd8545412644ea5b1e0ba208 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d8c7c4bf7c9f614a9deb6d953c0232042518406 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f51a19a37f1004f4021b66cc177a82f08af3abc5519c61ff90b124392a844b4 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68c431fa12e41f18e7ed8843913a786faa1e141b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fc6136fd667208aa14d49b962c185b4c43086df2120b8a85aae19aa237e0c9 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc4025015c60177fefa9b209ff2ccf2682a0dfda --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.45507812, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9959330518635272, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.71875, + "logps/chosen": -466.0, + "logps/rejected": -920.0, + "loss": 1.84228515625, + "memory(GiB)": 53.3, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014511 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 1.0006093513724832, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.71875, + "logits/rejected": -1.6015625, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.35357666015625, + "memory(GiB)": 53.3, + "nll_loss": 1.4140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0234375, + "rewards/margins": 0.0625, + "rewards/rejected": -0.0859375, + "step": 5, + "train_speed(iter/s)": 0.022719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.37515138030427864, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.46875, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 1.395166015625, + "memory(GiB)": 53.3, + "nll_loss": 0.9609375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8125, + "rewards/margins": 2.109375, + "rewards/rejected": -0.30078125, + "step": 10, + "train_speed(iter/s)": 0.024642 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.365020087187755, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.671875, + "logps/chosen": -458.0, + "logps/rejected": -516.0, + 
"loss": 1.206640625, + "memory(GiB)": 53.43, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1875, + "rewards/margins": 5.4375, + "rewards/rejected": 0.76171875, + "step": 15, + "train_speed(iter/s)": 0.025293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.10470765382319668, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.859375, + "logps/chosen": -294.0, + "logps/rejected": -466.0, + "loss": 0.68890380859375, + "memory(GiB)": 53.43, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 5.71875, + "rewards/rejected": 4.25, + "step": 20, + "train_speed(iter/s)": 0.025626 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.71875, + "eval_logits/rejected": -1.46875, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.52001953125, + "eval_nll_loss": 0.60546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 8.75, + "eval_rewards/rejected": 5.4375, + "eval_runtime": 4.2519, + "eval_samples_per_second": 0.941, + "eval_steps_per_second": 0.47, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.0804463784398531, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.578125, + "logps/chosen": -376.0, + "logps/rejected": -458.0, + "loss": 0.65897216796875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 6.84375, + "rewards/rejected": 5.21875, + "step": 25, + "train_speed(iter/s)": 0.025148 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.07518236632054026, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.6484375, + "logps/chosen": -370.0, + "logps/rejected": -470.0, + "loss": 0.56932373046875, + "memory(GiB)": 54.89, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 13.9375, + "rewards/margins": 9.875, + "rewards/rejected": 4.03125, + "step": 30, + "train_speed(iter/s)": 0.025393 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06767411898397745, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5546875, + "logps/chosen": -394.0, + "logps/rejected": -394.0, + "loss": 0.557476806640625, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.375, + "rewards/rejected": 4.375, + "step": 35, + "train_speed(iter/s)": 0.025333 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.047636534461983134, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.671875, + "logps/chosen": -368.0, + "logps/rejected": -478.0, + "loss": 0.5288330078125, + "memory(GiB)": 54.89, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 10.3125, + "rewards/rejected": 4.375, + "step": 40, + "train_speed(iter/s)": 0.025528 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6953125, + "eval_logits/rejected": -1.4375, + "eval_logps/chosen": -784.0, + "eval_logps/rejected": -236.0, + "eval_loss": 0.46875, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.0, + "eval_rewards/margins": 12.25, + "eval_rewards/rejected": 4.8125, + "eval_runtime": 4.1663, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.053771621820461274, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.6171875, + "logits/rejected": -1.7109375, + "logps/chosen": -292.0, + "logps/rejected": -540.0, + "loss": 0.5459747314453125, + "memory(GiB)": 54.89, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.9375, + "rewards/rejected": 4.9375, + "step": 45, 
+ "train_speed(iter/s)": 0.025633 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.0556754575357442, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -298.0, + "logps/rejected": -424.0, + "loss": 0.555657958984375, + "memory(GiB)": 54.89, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 10.1875, + "rewards/rejected": 4.78125, + "step": 50, + "train_speed(iter/s)": 0.025504 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03050128350589889, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.546875, + "logps/chosen": -340.0, + "logps/rejected": -392.0, + "loss": 0.5152923583984375, + "memory(GiB)": 54.89, + "nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 10.875, + "rewards/rejected": 4.59375, + "step": 55, + "train_speed(iter/s)": 0.025525 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.03973909846789092, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.53125, + "logps/chosen": -356.0, + "logps/rejected": -384.0, + "loss": 0.51326904296875, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.625, + "rewards/rejected": 4.875, + "step": 60, + "train_speed(iter/s)": 0.025583 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3671875, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": -237.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.625, + "eval_rewards/rejected": 4.65625, + "eval_runtime": 4.123, + "eval_samples_per_second": 0.97, + "eval_steps_per_second": 0.485, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.06262824842411734, + 
"learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6328125, + "logits/rejected": -1.6171875, + "logps/chosen": -354.0, + "logps/rejected": -504.0, + "loss": 0.52603759765625, + "memory(GiB)": 54.89, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.8125, + "rewards/margins": 10.875, + "rewards/rejected": 4.875, + "step": 65, + "train_speed(iter/s)": 0.025605 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.0481159192704273, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.5, + "logits/rejected": -1.6484375, + "logps/chosen": -251.0, + "logps/rejected": -544.0, + "loss": 0.48192138671875, + "memory(GiB)": 56.74, + "nll_loss": 0.474609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 10.6875, + "rewards/rejected": 4.0625, + "step": 70, + "train_speed(iter/s)": 0.025704 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.04995228118845036, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.4609375, + "logps/chosen": -364.0, + "logps/rejected": -420.0, + "loss": 0.5461639404296875, + "memory(GiB)": 56.74, + "nll_loss": 0.455078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.0, + "rewards/margins": 11.9375, + "rewards/rejected": 5.09375, + "step": 75, + "train_speed(iter/s)": 0.025622 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04248238771472134, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.515625, + "logps/chosen": -344.0, + "logps/rejected": -364.0, + "loss": 0.46167449951171874, + "memory(GiB)": 56.74, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 11.9375, + "rewards/rejected": 4.59375, + "step": 80, + "train_speed(iter/s)": 0.025696 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.3359375, + "eval_logps/chosen": -772.0, + "eval_logps/rejected": 
-239.0, + "eval_loss": 0.455078125, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 18.25, + "eval_rewards/margins": 13.875, + "eval_rewards/rejected": 4.4375, + "eval_runtime": 4.274, + "eval_samples_per_second": 0.936, + "eval_steps_per_second": 0.468, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 43131019722752.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc02ec60ccbe1d82119b08f8f2a15e7027a1a6be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922cf044a9f455372643d7da783ad8b791f0e5e284707238663331a9af802319 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python 
+ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if 
zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in 
state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..04e69d70dda064882135f0ef5b5ece515507fb82 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..25a97cabd9bfb6515ba88598143485001313ac62 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..dc755d46bc7711c97a951469715d4d3dcf013864 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..6c680aa4fd754e45d66950bba12327f44f6c8db7 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..915780049267cecf48c3f0a27d166b0e33787f30 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a0e1718850fb17a621b6c01a695262ba02188f2f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..8cead5875b75d617a13f1247e15b17c39a5168b6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..c22be77b0ac026dec5e849d31ae5f8ae91842350 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..5530f2c5e514365b665cd7434a92b7f021303345 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1aa54eacdf81014524ebc930bba7fdd78f109e5c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..6317a3ba3fcd25e9c48087e2c895a71afea7134a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..0eaaa265d7cca69bade0afe654f150478abc47ef Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a900cff9dc262f927de587c40593df235bed390c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..2c72126dce5328d7e5cbf3f3ba730b04185f2ff2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..40f49001ff2bc7e05f35248cc8fee20849eea0bb Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..a138c3207804ef5be69dc8ff0d41e1abf82a564e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..b89ce30817956df861ba64aaf1630c521e827fee Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..99a50b42e32027a29c0e4abf7ed5b39b2e1bdb8e Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e85fe72727470bb42a15468aa23bc85f4b874e5e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ce5f5c2e164657d3568def24f6a5c1e032c3327e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..117c09a0297025a1d27321829a00d75c1fb67c25 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_memory(GiB).png new file mode 100644 
index 0000000000000000000000000000000000000000..14ff3e69d30fcdebee8c85ec55bb425d12ca3dd1 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..66e1e281659fafa433e3b1ce25045ff0f1df1bf7 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..55d867d7bb2949422c9cc53603b993201e4b1d11 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..1f84150f392c931ee81e5ef704cce5948c28d233 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..f50dc353d605ed163335f947c6b457b4c85b4d06 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..e2f4eab6c6d0dcbc63d5d593832aa4ae64c72eeb Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..0d899eba4c33fe96c90b917c4166c18e53739670 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..17c5ac67da69830852033b843168df2985cfa307 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_loss.png 
differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..95d6269736d540b488e54d8188c2a8c6463a56fd Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..f418fbe7d9fdb77bb7362da5fa0c5f814639f6df Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..e583b26472ec0193188607eba093c1960c7e085f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..836fc6ae43413cd867a5d01e5502139388d64a57 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/logging.jsonl b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93b9e9d651e4140de00c6ed1ff1189682f33a803 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/logging.jsonl @@ -0,0 +1,33 @@ +{"loss": 1.84228516, "grad_norm": 0.99593305, "learning_rate": 1.667e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.014511, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -920.0, "logps/chosen": -466.0, "logits/rejected": -1.71875, "logits/chosen": -1.9375, "nll_loss": 1.4765625, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "55s", "remaining_time": "1h 51m 0s"} +{"loss": 2.35357666, "grad_norm": 1.00060935, "learning_rate": 8.333e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.022719, "rewards/chosen": -0.0234375, "rewards/rejected": -0.0859375, "rewards/accuracies": 0.28125, "rewards/margins": 0.0625, "logps/rejected": -552.0, "logps/chosen": -576.0, "logits/rejected": -1.6015625, "logits/chosen": -1.71875, "nll_loss": 1.4140625, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "3m 27s", "remaining_time": "1h 19m 24s"} +{"loss": 1.39516602, "grad_norm": 0.37515138, "learning_rate": 9.97e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.024642, "rewards/chosen": 1.8125, "rewards/rejected": -0.30078125, "rewards/accuracies": 0.92500001, "rewards/margins": 2.109375, 
"logps/rejected": -496.0, "logps/chosen": -506.0, "logits/rejected": -1.46875, "logits/chosen": -1.7578125, "nll_loss": 0.9609375, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "6m 32s", "remaining_time": "1h 12m 1s"} +{"loss": 1.20664062, "grad_norm": 0.36502009, "learning_rate": 9.847e-05, "memory(GiB)": 53.43, "train_speed(iter/s)": 0.025293, "rewards/chosen": 6.1875, "rewards/rejected": 0.76171875, "rewards/accuracies": 1.0, "rewards/margins": 5.4375, "logps/rejected": -516.0, "logps/chosen": -458.0, "logits/rejected": -1.671875, "logits/chosen": -1.9140625, "nll_loss": 1.1015625, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "9m 40s", "remaining_time": "1h 7m 40s"} +{"loss": 0.68890381, "grad_norm": 0.10470765, "learning_rate": 9.632e-05, "memory(GiB)": 53.43, "train_speed(iter/s)": 0.025626, "rewards/chosen": 10.0, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 5.71875, "logps/rejected": -466.0, "logps/chosen": -294.0, "logits/rejected": -1.859375, "logits/chosen": -1.828125, "nll_loss": 0.6953125, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "12m 47s", "remaining_time": "1h 3m 57s"} +{"eval_loss": 0.52001953, "eval_runtime": 4.2519, "eval_samples_per_second": 0.941, "eval_steps_per_second": 0.47, "eval_rewards/chosen": 14.25, "eval_rewards/rejected": 5.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.75, "eval_logps/rejected": -229.0, "eval_logps/chosen": -812.0, "eval_logits/rejected": -1.46875, "eval_logits/chosen": -1.71875, "eval_nll_loss": 0.60546875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "12m 51s", "remaining_time": "1h 4m 18s"} +{"loss": 0.65897217, "grad_norm": 0.08044638, "learning_rate": 9.33e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025148, "rewards/chosen": 12.0625, "rewards/rejected": 5.21875, 
"rewards/accuracies": 1.0, "rewards/margins": 6.84375, "logps/rejected": -458.0, "logps/chosen": -376.0, "logits/rejected": -1.578125, "logits/chosen": -1.78125, "nll_loss": 0.609375, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "16m 21s", "remaining_time": "1h 2m 8s"} +{"loss": 0.56932373, "grad_norm": 0.07518237, "learning_rate": 8.946e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025393, "rewards/chosen": 13.9375, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 9.875, "logps/rejected": -470.0, "logps/chosen": -370.0, "logits/rejected": -1.6484375, "logits/chosen": -1.78125, "nll_loss": 0.609375, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "19m 28s", "remaining_time": "58m 25s"} +{"loss": 0.55747681, "grad_norm": 0.06767412, "learning_rate": 8.486e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025333, "rewards/chosen": 14.75, "rewards/rejected": 4.375, "rewards/accuracies": 1.0, "rewards/margins": 10.375, "logps/rejected": -394.0, "logps/chosen": -394.0, "logits/rejected": -1.5546875, "logits/chosen": -1.8203125, "nll_loss": 0.57421875, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "22m 48s", "remaining_time": "55m 23s"} +{"loss": 0.52883301, "grad_norm": 0.04763653, "learning_rate": 7.961e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025528, "rewards/chosen": 14.6875, "rewards/rejected": 4.375, "rewards/accuracies": 1.0, "rewards/margins": 10.3125, "logps/rejected": -478.0, "logps/chosen": -368.0, "logits/rejected": -1.671875, "logits/chosen": -1.75, "nll_loss": 0.55859375, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "25m 53s", "remaining_time": "51m 47s"} +{"eval_loss": 0.46875, "eval_runtime": 4.1663, "eval_samples_per_second": 0.96, "eval_steps_per_second": 0.48, "eval_rewards/chosen": 17.0, 
"eval_rewards/rejected": 4.8125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.25, "eval_logps/rejected": -236.0, "eval_logps/chosen": -784.0, "eval_logits/rejected": -1.4375, "eval_logits/chosen": -1.6953125, "eval_nll_loss": 0.5625, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "25m 58s", "remaining_time": "51m 56s"} +{"loss": 0.54597473, "grad_norm": 0.05377162, "learning_rate": 7.38e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025633, "rewards/chosen": 14.875, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 9.9375, "logps/rejected": -540.0, "logps/chosen": -292.0, "logits/rejected": -1.7109375, "logits/chosen": -1.6171875, "nll_loss": 0.57421875, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "29m 2s", "remaining_time": "48m 24s"} +{"loss": 0.55565796, "grad_norm": 0.05567546, "learning_rate": 6.753e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025504, "rewards/chosen": 14.9375, "rewards/rejected": 4.78125, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -424.0, "logps/chosen": -298.0, "logits/rejected": -1.5625, "logits/chosen": -1.609375, "nll_loss": 0.49023438, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "32m 27s", "remaining_time": "45m 26s"} +{"loss": 0.51529236, "grad_norm": 0.03050128, "learning_rate": 6.093e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025525, "rewards/chosen": 15.5, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -392.0, "logps/chosen": -340.0, "logits/rejected": -1.546875, "logits/chosen": -1.8046875, "nll_loss": 0.53125, "epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "35m 41s", "remaining_time": "42m 11s"} +{"loss": 0.51326904, "grad_norm": 0.0397391, "learning_rate": 5.413e-05, "memory(GiB)": 54.89, 
"train_speed(iter/s)": 0.025583, "rewards/chosen": 16.5, "rewards/rejected": 4.875, "rewards/accuracies": 1.0, "rewards/margins": 11.625, "logps/rejected": -384.0, "logps/chosen": -356.0, "logits/rejected": -1.53125, "logits/chosen": -1.7578125, "nll_loss": 0.5, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "38m 52s", "remaining_time": "38m 52s"} +{"eval_loss": 0.45703125, "eval_runtime": 4.123, "eval_samples_per_second": 0.97, "eval_steps_per_second": 0.485, "eval_rewards/chosen": 18.25, "eval_rewards/rejected": 4.65625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.625, "eval_logps/rejected": -237.0, "eval_logps/chosen": -772.0, "eval_logits/rejected": -1.3671875, "eval_logits/chosen": -1.625, "eval_nll_loss": 0.546875, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "38m 56s", "remaining_time": "38m 56s"} +{"loss": 0.5260376, "grad_norm": 0.06262825, "learning_rate": 4.725e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025605, "rewards/chosen": 15.8125, "rewards/rejected": 4.875, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -504.0, "logps/chosen": -354.0, "logits/rejected": -1.6171875, "logits/chosen": -1.6328125, "nll_loss": 0.5, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "42m 5s", "remaining_time": "35m 37s"} +{"loss": 0.48192139, "grad_norm": 0.04811592, "learning_rate": 4.041e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025704, "rewards/chosen": 14.75, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -544.0, "logps/chosen": -251.0, "logits/rejected": -1.6484375, "logits/chosen": -1.5, "nll_loss": 0.47460938, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "45m 10s", "remaining_time": "32m 15s"} +{"loss": 0.54616394, "grad_norm": 0.04995228, "learning_rate": 
3.377e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025622, "rewards/chosen": 17.0, "rewards/rejected": 5.09375, "rewards/accuracies": 1.0, "rewards/margins": 11.9375, "logps/rejected": -420.0, "logps/chosen": -364.0, "logits/rejected": -1.4609375, "logits/chosen": -1.7265625, "nll_loss": 0.45507812, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "48m 34s", "remaining_time": "29m 8s"} +{"loss": 0.4616745, "grad_norm": 0.04248239, "learning_rate": 2.742e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025696, "rewards/chosen": 16.5, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 11.9375, "logps/rejected": -364.0, "logps/chosen": -344.0, "logits/rejected": -1.515625, "logits/chosen": -1.703125, "nll_loss": 0.48242188, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "51m 40s", "remaining_time": "25m 50s"} +{"eval_loss": 0.45507812, "eval_runtime": 4.274, "eval_samples_per_second": 0.936, "eval_steps_per_second": 0.468, "eval_rewards/chosen": 18.25, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.875, "eval_logps/rejected": -239.0, "eval_logps/chosen": -772.0, "eval_logits/rejected": -1.3359375, "eval_logits/chosen": -1.609375, "eval_nll_loss": 0.546875, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "51m 44s", "remaining_time": "25m 52s"} +{"loss": 0.49245605, "grad_norm": 0.05361653, "learning_rate": 2.151e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025636, "rewards/chosen": 17.375, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 13.125, "logps/rejected": -488.0, "logps/chosen": -396.0, "logits/rejected": -1.578125, "logits/chosen": -1.65625, "nll_loss": 0.5234375, "epoch": 3.52525253, "global_step/max_steps": "85/120", "percentage": "70.83%", "elapsed_time": "55m 2s", "remaining_time": "22m 39s"} +{"loss": 
0.45856934, "grad_norm": 0.04902186, "learning_rate": 1.614e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025692, "rewards/chosen": 16.25, "rewards/rejected": 3.8125, "rewards/accuracies": 1.0, "rewards/margins": 12.5, "logps/rejected": -478.0, "logps/chosen": -298.0, "logits/rejected": -1.5234375, "logits/chosen": -1.578125, "nll_loss": 0.45507812, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "58m 10s", "remaining_time": "19m 23s"} +{"loss": 0.50463257, "grad_norm": 0.05117058, "learning_rate": 1.14e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025745, "rewards/chosen": 15.5625, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -502.0, "logps/chosen": -253.0, "logits/rejected": -1.625, "logits/chosen": -1.65625, "nll_loss": 0.46679688, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "1h 1m 17s", "remaining_time": "16m 7s"} +{"loss": 0.53009644, "grad_norm": 0.05141718, "learning_rate": 7.4e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025709, "rewards/chosen": 15.0, "rewards/rejected": 3.9375, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -498.0, "logps/chosen": -247.0, "logits/rejected": -1.6953125, "logits/chosen": -1.5859375, "nll_loss": 0.4296875, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "1h 4m 36s", "remaining_time": "12m 55s"} +{"eval_loss": 0.453125, "eval_runtime": 3.8342, "eval_samples_per_second": 1.043, "eval_steps_per_second": 0.522, "eval_rewards/chosen": 18.625, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.25, "eval_logps/rejected": -240.0, "eval_logps/chosen": -768.0, "eval_logits/rejected": -1.328125, "eval_logits/chosen": -1.5859375, "eval_nll_loss": 0.546875, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": 
"1h 4m 40s", "remaining_time": "12m 56s"} +{"loss": 0.47074127, "grad_norm": 0.06586561, "learning_rate": 4.21e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025739, "rewards/chosen": 16.125, "rewards/rejected": 3.890625, "rewards/accuracies": 1.0, "rewards/margins": 12.1875, "logps/rejected": -450.0, "logps/chosen": -280.0, "logits/rejected": -1.5390625, "logits/chosen": -1.5703125, "nll_loss": 0.43945312, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "1h 7m 46s", "remaining_time": "9m 40s"} +{"loss": 0.52365417, "grad_norm": 0.05873221, "learning_rate": 1.89e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025778, "rewards/chosen": 17.25, "rewards/rejected": 4.96875, "rewards/accuracies": 1.0, "rewards/margins": 12.25, "logps/rejected": -596.0, "logps/chosen": -418.0, "logits/rejected": -1.578125, "logits/chosen": -1.6640625, "nll_loss": 0.55078125, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "1h 10m 54s", "remaining_time": "6m 26s"} +{"loss": 0.51962585, "grad_norm": 0.10912919, "learning_rate": 4.7e-07, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025827, "rewards/chosen": 17.375, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 12.75, "logps/rejected": -440.0, "logps/chosen": -374.0, "logits/rejected": -1.515625, "logits/chosen": -1.6484375, "nll_loss": 0.54296875, "epoch": 4.76767677, "global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "1h 13m 59s", "remaining_time": "3m 13s"} +{"loss": 0.44427338, "grad_norm": 0.06404241, "learning_rate": 0.0, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025823, "rewards/chosen": 17.625, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 12.4375, "logps/rejected": -536.0, "logps/chosen": -386.0, "logits/rejected": -1.453125, "logits/chosen": -1.546875, "nll_loss": 0.47265625, "epoch": 4.96969697, "global_step/max_steps": "120/120", 
"percentage": "100.00%", "elapsed_time": "1h 17m 14s", "remaining_time": "0s"} +{"eval_loss": 0.45166016, "eval_runtime": 3.9949, "eval_samples_per_second": 1.001, "eval_steps_per_second": 0.501, "eval_rewards/chosen": 18.75, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.25, "eval_logps/rejected": -239.0, "eval_logps/chosen": -768.0, "eval_logits/rejected": -1.3125, "eval_logits/chosen": -1.578125, "eval_nll_loss": 0.54296875, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "1h 17m 18s", "remaining_time": "0s"} +{"train_runtime": 4639.7761, "train_samples_per_second": 0.427, "train_steps_per_second": 0.026, "total_flos": 64868340203520.0, "train_loss": 0.6644448, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "1h 17m 19s", "remaining_time": "0s"} +{"train_dataset": "1184.691919±553.980140, min=300.000000, max=6113.000000, size=396", "val_dataset": "1183.750000±508.140421, min=717.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 14804.4401M Params (34.4064M Trainable [0.2324%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/checkpoint-120", "best_metric": 0.45166016, "global_step": 120, "log_history": [{"loss": 1.84228515625, "grad_norm": 0.9959330518635272, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.014511, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -920.0, "logps/chosen": -466.0, "logits/rejected": -1.71875, "logits/chosen": -1.9375, "nll_loss": 1.4765625, "epoch": 0.04040404040404041, "step": 
1}, {"loss": 2.35357666015625, "grad_norm": 1.0006093513724832, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.022719, "rewards/chosen": -0.0234375, "rewards/rejected": -0.0859375, "rewards/accuracies": 0.28125, "rewards/margins": 0.0625, "logps/rejected": -552.0, "logps/chosen": -576.0, "logits/rejected": -1.6015625, "logits/chosen": -1.71875, "nll_loss": 1.4140625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.395166015625, "grad_norm": 0.37515138030427864, "learning_rate": 9.969653386589748e-05, "memory(GiB)": 53.3, "train_speed(iter/s)": 0.024642, "rewards/chosen": 1.8125, "rewards/rejected": -0.30078125, "rewards/accuracies": 0.925000011920929, "rewards/margins": 2.109375, "logps/rejected": -496.0, "logps/chosen": -506.0, "logits/rejected": -1.46875, "logits/chosen": -1.7578125, "nll_loss": 0.9609375, "epoch": 0.40404040404040403, "step": 10}, {"loss": 1.206640625, "grad_norm": 0.365020087187755, "learning_rate": 9.847001329696653e-05, "memory(GiB)": 53.43, "train_speed(iter/s)": 0.025293, "rewards/chosen": 6.1875, "rewards/rejected": 0.76171875, "rewards/accuracies": 1.0, "rewards/margins": 5.4375, "logps/rejected": -516.0, "logps/chosen": -458.0, "logits/rejected": -1.671875, "logits/chosen": -1.9140625, "nll_loss": 1.1015625, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.68890380859375, "grad_norm": 0.10470765382319668, "learning_rate": 9.632470336074009e-05, "memory(GiB)": 53.43, "train_speed(iter/s)": 0.025626, "rewards/chosen": 10.0, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 5.71875, "logps/rejected": -466.0, "logps/chosen": -294.0, "logits/rejected": -1.859375, "logits/chosen": -1.828125, "nll_loss": 0.6953125, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.52001953125, "eval_runtime": 4.2519, "eval_samples_per_second": 0.941, "eval_steps_per_second": 0.47, "eval_rewards/chosen": 14.25, "eval_rewards/rejected": 5.4375, "eval_rewards/accuracies": 1.0, 
"eval_rewards/margins": 8.75, "eval_logps/rejected": -229.0, "eval_logps/chosen": -812.0, "eval_logits/rejected": -1.46875, "eval_logits/chosen": -1.71875, "eval_nll_loss": 0.60546875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.65897216796875, "grad_norm": 0.0804463784398531, "learning_rate": 9.330127018922194e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025148, "rewards/chosen": 12.0625, "rewards/rejected": 5.21875, "rewards/accuracies": 1.0, "rewards/margins": 6.84375, "logps/rejected": -458.0, "logps/chosen": -376.0, "logits/rejected": -1.578125, "logits/chosen": -1.78125, "nll_loss": 0.609375, "epoch": 1.0404040404040404, "step": 25}, {"loss": 0.56932373046875, "grad_norm": 0.07518236632054026, "learning_rate": 8.945702546981969e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025393, "rewards/chosen": 13.9375, "rewards/rejected": 4.03125, "rewards/accuracies": 1.0, "rewards/margins": 9.875, "logps/rejected": -470.0, "logps/chosen": -370.0, "logits/rejected": -1.6484375, "logits/chosen": -1.78125, "nll_loss": 0.609375, "epoch": 1.2424242424242424, "step": 30}, {"loss": 0.557476806640625, "grad_norm": 0.06767411898397745, "learning_rate": 8.486484005469977e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025333, "rewards/chosen": 14.75, "rewards/rejected": 4.375, "rewards/accuracies": 1.0, "rewards/margins": 10.375, "logps/rejected": -394.0, "logps/chosen": -394.0, "logits/rejected": -1.5546875, "logits/chosen": -1.8203125, "nll_loss": 0.57421875, "epoch": 1.4444444444444444, "step": 35}, {"loss": 0.5288330078125, "grad_norm": 0.047636534461983134, "learning_rate": 7.961176263324901e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025528, "rewards/chosen": 14.6875, "rewards/rejected": 4.375, "rewards/accuracies": 1.0, "rewards/margins": 10.3125, "logps/rejected": -478.0, "logps/chosen": -368.0, "logits/rejected": -1.671875, "logits/chosen": -1.75, "nll_loss": 0.55859375, "epoch": 1.6464646464646466, "step": 40}, {"eval_loss": 0.46875, 
"eval_runtime": 4.1663, "eval_samples_per_second": 0.96, "eval_steps_per_second": 0.48, "eval_rewards/chosen": 17.0, "eval_rewards/rejected": 4.8125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.25, "eval_logps/rejected": -236.0, "eval_logps/chosen": -784.0, "eval_logits/rejected": -1.4375, "eval_logits/chosen": -1.6953125, "eval_nll_loss": 0.5625, "epoch": 1.6464646464646466, "step": 40}, {"loss": 0.5459747314453125, "grad_norm": 0.053771621820461274, "learning_rate": 7.379736965185368e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025633, "rewards/chosen": 14.875, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 9.9375, "logps/rejected": -540.0, "logps/chosen": -292.0, "logits/rejected": -1.7109375, "logits/chosen": -1.6171875, "nll_loss": 0.57421875, "epoch": 1.8484848484848486, "step": 45}, {"loss": 0.555657958984375, "grad_norm": 0.0556754575357442, "learning_rate": 6.753187775963773e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025504, "rewards/chosen": 14.9375, "rewards/rejected": 4.78125, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -424.0, "logps/chosen": -298.0, "logits/rejected": -1.5625, "logits/chosen": -1.609375, "nll_loss": 0.490234375, "epoch": 2.080808080808081, "step": 50}, {"loss": 0.5152923583984375, "grad_norm": 0.03050128350589889, "learning_rate": 6.09340545603188e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025525, "rewards/chosen": 15.5, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -392.0, "logps/chosen": -340.0, "logits/rejected": -1.546875, "logits/chosen": -1.8046875, "nll_loss": 0.53125, "epoch": 2.282828282828283, "step": 55}, {"loss": 0.51326904296875, "grad_norm": 0.03973909846789092, "learning_rate": 5.4128967273616625e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025583, "rewards/chosen": 16.5, "rewards/rejected": 4.875, "rewards/accuracies": 1.0, "rewards/margins": 11.625, 
"logps/rejected": -384.0, "logps/chosen": -356.0, "logits/rejected": -1.53125, "logits/chosen": -1.7578125, "nll_loss": 0.5, "epoch": 2.484848484848485, "step": 60}, {"eval_loss": 0.45703125, "eval_runtime": 4.123, "eval_samples_per_second": 0.97, "eval_steps_per_second": 0.485, "eval_rewards/chosen": 18.25, "eval_rewards/rejected": 4.65625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.625, "eval_logps/rejected": -237.0, "eval_logps/chosen": -772.0, "eval_logits/rejected": -1.3671875, "eval_logits/chosen": -1.625, "eval_nll_loss": 0.546875, "epoch": 2.484848484848485, "step": 60}, {"loss": 0.52603759765625, "grad_norm": 0.06262824842411734, "learning_rate": 4.7245611982206724e-05, "memory(GiB)": 54.89, "train_speed(iter/s)": 0.025605, "rewards/chosen": 15.8125, "rewards/rejected": 4.875, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -504.0, "logps/chosen": -354.0, "logits/rejected": -1.6171875, "logits/chosen": -1.6328125, "nll_loss": 0.5, "epoch": 2.686868686868687, "step": 65}, {"loss": 0.48192138671875, "grad_norm": 0.0481159192704273, "learning_rate": 4.0414468403813095e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025704, "rewards/chosen": 14.75, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -544.0, "logps/chosen": -251.0, "logits/rejected": -1.6484375, "logits/chosen": -1.5, "nll_loss": 0.474609375, "epoch": 2.888888888888889, "step": 70}, {"loss": 0.5461639404296875, "grad_norm": 0.04995228118845036, "learning_rate": 3.3765026539765834e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025622, "rewards/chosen": 17.0, "rewards/rejected": 5.09375, "rewards/accuracies": 1.0, "rewards/margins": 11.9375, "logps/rejected": -420.0, "logps/chosen": -364.0, "logits/rejected": -1.4609375, "logits/chosen": -1.7265625, "nll_loss": 0.455078125, "epoch": 3.121212121212121, "step": 75}, {"loss": 0.46167449951171874, "grad_norm": 0.04248238771472134, "learning_rate": 
2.7423332084455544e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025696, "rewards/chosen": 16.5, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 11.9375, "logps/rejected": -364.0, "logps/chosen": -344.0, "logits/rejected": -1.515625, "logits/chosen": -1.703125, "nll_loss": 0.482421875, "epoch": 3.323232323232323, "step": 80}, {"eval_loss": 0.455078125, "eval_runtime": 4.274, "eval_samples_per_second": 0.936, "eval_steps_per_second": 0.468, "eval_rewards/chosen": 18.25, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.875, "eval_logps/rejected": -239.0, "eval_logps/chosen": -772.0, "eval_logits/rejected": -1.3359375, "eval_logits/chosen": -1.609375, "eval_nll_loss": 0.546875, "epoch": 3.323232323232323, "step": 80}, {"loss": 0.4924560546875, "grad_norm": 0.053616530504593876, "learning_rate": 2.150959712448669e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025636, "rewards/chosen": 17.375, "rewards/rejected": 4.21875, "rewards/accuracies": 1.0, "rewards/margins": 13.125, "logps/rejected": -488.0, "logps/chosen": -396.0, "logits/rejected": -1.578125, "logits/chosen": -1.65625, "nll_loss": 0.5234375, "epoch": 3.525252525252525, "step": 85}, {"loss": 0.4585693359375, "grad_norm": 0.04902185513812503, "learning_rate": 1.6135921418712956e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025692, "rewards/chosen": 16.25, "rewards/rejected": 3.8125, "rewards/accuracies": 1.0, "rewards/margins": 12.5, "logps/rejected": -478.0, "logps/chosen": -298.0, "logits/rejected": -1.5234375, "logits/chosen": -1.578125, "nll_loss": 0.455078125, "epoch": 3.7272727272727275, "step": 90}, {"loss": 0.504632568359375, "grad_norm": 0.051170579618764526, "learning_rate": 1.1404167454183957e-05, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025745, "rewards/chosen": 15.5625, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -502.0, "logps/chosen": -253.0, 
"logits/rejected": -1.625, "logits/chosen": -1.65625, "nll_loss": 0.466796875, "epoch": 3.929292929292929, "step": 95}, {"loss": 0.530096435546875, "grad_norm": 0.05141718150059864, "learning_rate": 7.404029558083653e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025709, "rewards/chosen": 15.0, "rewards/rejected": 3.9375, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -498.0, "logps/chosen": -247.0, "logits/rejected": -1.6953125, "logits/chosen": -1.5859375, "nll_loss": 0.4296875, "epoch": 4.161616161616162, "step": 100}, {"eval_loss": 0.453125, "eval_runtime": 3.8342, "eval_samples_per_second": 1.043, "eval_steps_per_second": 0.522, "eval_rewards/chosen": 18.625, "eval_rewards/rejected": 4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.25, "eval_logps/rejected": -240.0, "eval_logps/chosen": -768.0, "eval_logits/rejected": -1.328125, "eval_logits/chosen": -1.5859375, "eval_nll_loss": 0.546875, "epoch": 4.161616161616162, "step": 100}, {"loss": 0.47074127197265625, "grad_norm": 0.06586560796758323, "learning_rate": 4.2113336672471245e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025739, "rewards/chosen": 16.125, "rewards/rejected": 3.890625, "rewards/accuracies": 1.0, "rewards/margins": 12.1875, "logps/rejected": -450.0, "logps/chosen": -280.0, "logits/rejected": -1.5390625, "logits/chosen": -1.5703125, "nll_loss": 0.439453125, "epoch": 4.363636363636363, "step": 105}, {"loss": 0.5236541748046875, "grad_norm": 0.05873220563672292, "learning_rate": 1.8865999845374793e-06, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025778, "rewards/chosen": 17.25, "rewards/rejected": 4.96875, "rewards/accuracies": 1.0, "rewards/margins": 12.25, "logps/rejected": -596.0, "logps/chosen": -418.0, "logits/rejected": -1.578125, "logits/chosen": -1.6640625, "nll_loss": 0.55078125, "epoch": 4.565656565656566, "step": 110}, {"loss": 0.5196258544921875, "grad_norm": 0.10912918908815608, "learning_rate": 4.738957681248379e-07, 
"memory(GiB)": 56.74, "train_speed(iter/s)": 0.025827, "rewards/chosen": 17.375, "rewards/rejected": 4.59375, "rewards/accuracies": 1.0, "rewards/margins": 12.75, "logps/rejected": -440.0, "logps/chosen": -374.0, "logits/rejected": -1.515625, "logits/chosen": -1.6484375, "nll_loss": 0.54296875, "epoch": 4.767676767676767, "step": 115}, {"loss": 0.44427337646484377, "grad_norm": 0.06404241447513843, "learning_rate": 0.0, "memory(GiB)": 56.74, "train_speed(iter/s)": 0.025823, "rewards/chosen": 17.625, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 12.4375, "logps/rejected": -536.0, "logps/chosen": -386.0, "logits/rejected": -1.453125, "logits/chosen": -1.546875, "nll_loss": 0.47265625, "epoch": 4.96969696969697, "step": 120}, {"eval_loss": 0.45166015625, "eval_runtime": 3.9949, "eval_samples_per_second": 1.001, "eval_steps_per_second": 0.501, "eval_rewards/chosen": 18.75, "eval_rewards/rejected": 4.4375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.25, "eval_logps/rejected": -239.0, "eval_logps/chosen": -768.0, "eval_logits/rejected": -1.3125, "eval_logits/chosen": -1.578125, "eval_nll_loss": 0.54296875, "epoch": 4.96969696969697, "step": 120}, {"train_runtime": 4639.7761, "train_samples_per_second": 0.427, "train_steps_per_second": 0.026, "total_flos": 64868340203520.0, "train_loss": 0.6644447962443034, "epoch": 4.96969696969697, "step": 120}], "memory": 56.736328125} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs/events.out.tfevents.1737751069.kml-dtmachine-18088-prod.55191.0 b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs/events.out.tfevents.1737751069.kml-dtmachine-18088-prod.55191.0 new file mode 100644 index 0000000000000000000000000000000000000000..30f04115208a74217a811a9a50ab5d3885471065 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_random20/v0-20250124-203652/runs/events.out.tfevents.1737751069.kml-dtmachine-18088-prod.55191.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:230f9306baa9e021adf55df2c8483520650b8ecd855c1dd8bdf7a1611a993933 +size 33753 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + 
"max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": 
false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + 
"group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + 
"use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + 
"cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], 
dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/README.md 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_config.json 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..218505a389a8abe93dc0ced8e1d9ce0b424dff4c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5fdbae93168c2bd3cf2006c13757a7d43a7950e3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda7f681a58fcb762ac01f351c31e6b5a17d9d905e378c8cbee84fdbae2615f2 +size 68902296 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": 
"reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, 
save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 
'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, 
sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e427f94211ac17daa728f68a90d422b18f5b475 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a1a5a75abbc46337692397f90f789b113c78a09df6542d760fe2073c6ab19b +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e88e752e77cca18cbbfd695c0cf1060be9be620 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb37d7bb03a9e3b31b7516734ca164ead1d69e23a91c39afb6ff7c0832f2c004 +size 206442416 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a43cd493bfff38acaa692b61da877d08808c8d03 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e06976ef4c3c32acc07db90f2553b5c853042b8ca1a43ed089e4b947ac8238 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9565d4714cc1d67720e7184ba2c456961eb94fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74075f57211a8dfa6b22041d44272db794a95351cb4b97b06c931f4957df6572 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/scheduler.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a57076f7cecb0f02622ebad5f4481d4c3d7997e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.44970703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9622549154192331, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 50.11, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014247 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9351676023631289, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.420166015625, + "memory(GiB)": 50.12, + "nll_loss": 1.453125, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015625, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.0031280517578125, + "step": 5, + "train_speed(iter/s)": 
0.019723 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7493921719992903, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -512.0, + "logps/rejected": -488.0, + "loss": 1.6756103515625, + "memory(GiB)": 50.12, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3671875, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.5, + "step": 10, + "train_speed(iter/s)": 0.021058 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.5025734018127719, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.671875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 1.52564697265625, + "memory(GiB)": 50.12, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.09375, + "rewards/margins": 2.5625, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.021613 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13150315476978253, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8284912109375, + "memory(GiB)": 50.12, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.125, + "rewards/margins": 4.15625, + "rewards/rejected": 5.9375, + "step": 20, + "train_speed(iter/s)": 0.021885 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -217.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 6.8125, + "eval_runtime": 4.6538, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 
0.09166399117718381, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.625, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.67142333984375, + "memory(GiB)": 50.25, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.625, + "rewards/rejected": 7.25, + "step": 25, + "train_speed(iter/s)": 0.021519 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06340553413984125, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -452.0, + "loss": 0.57769775390625, + "memory(GiB)": 50.25, + "nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.3125, + "rewards/rejected": 5.71875, + "step": 30, + "train_speed(iter/s)": 0.021644 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.0632161482695805, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.5692626953125, + "memory(GiB)": 50.25, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.625, + "rewards/margins": 9.375, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.021577 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.053103368947490286, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -472.0, + "loss": 0.541656494140625, + "memory(GiB)": 50.25, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.625, + "rewards/rejected": 4.84375, + "step": 40, + "train_speed(iter/s)": 0.021811 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5078125, + "eval_logps/chosen": -788.0, + 
"eval_logps/rejected": -229.0, + "eval_loss": 0.4697265625, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.5, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 5.59375, + "eval_runtime": 5.3016, + "eval_samples_per_second": 0.754, + "eval_steps_per_second": 0.377, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.061679603479570924, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5596527099609375, + "memory(GiB)": 50.25, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 9.6875, + "rewards/rejected": 5.0625, + "step": 45, + "train_speed(iter/s)": 0.021921 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.055295440323398615, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.574530029296875, + "memory(GiB)": 50.25, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.021823 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03219995141738499, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.609375, + "logps/chosen": -340.0, + "logps/rejected": -382.0, + "loss": 0.525201416015625, + "memory(GiB)": 51.7, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.4375, + "rewards/margins": 9.8125, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02183 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04232837527263778, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + 
"loss": 0.5289306640625, + "memory(GiB)": 51.7, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.021852 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -221.0, + "eval_loss": 0.45556640625, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 6.40625, + "eval_runtime": 5.3977, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.371, + "step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.08354331862094756, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6796875, + "logits/rejected": -1.6953125, + "logps/chosen": -358.0, + "logps/rejected": -496.0, + "loss": 0.5365966796875, + "memory(GiB)": 51.7, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 9.9375, + "rewards/rejected": 5.71875, + "step": 65, + "train_speed(iter/s)": 0.021884 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.04503346258794718, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.546875, + "logits/rejected": -1.703125, + "logps/chosen": -253.0, + "logps/rejected": -532.0, + "loss": 0.49569091796875, + "memory(GiB)": 51.84, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 9.4375, + "rewards/rejected": 5.25, + "step": 70, + "train_speed(iter/s)": 0.02195 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.05428668175176237, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.4921875, + "logps/chosen": -368.0, + "logps/rejected": -408.0, + "loss": 0.561590576171875, + "memory(GiB)": 51.84, + "nll_loss": 0.470703125, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 10.3125, + "rewards/rejected": 6.40625, + "step": 75, + "train_speed(iter/s)": 0.021848 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.045056732174556025, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.5546875, + "logps/chosen": -346.0, + "logps/rejected": -352.0, + "loss": 0.470965576171875, + "memory(GiB)": 51.84, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 10.625, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.021895 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.640625, + "eval_logits/rejected": -1.3671875, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.45068359375, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.4375, + "eval_rewards/rejected": 6.25, + "eval_runtime": 5.3733, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.372, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.048909228620193326, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -400.0, + "logps/rejected": -476.0, + "loss": 0.506683349609375, + "memory(GiB)": 51.84, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.125, + "rewards/margins": 11.75, + "rewards/rejected": 5.3125, + "step": 85, + "train_speed(iter/s)": 0.0218 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.043787724459711776, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -302.0, + "logps/rejected": -468.0, + "loss": 0.470556640625, + "memory(GiB)": 51.84, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 11.0625, + 
"rewards/rejected": 5.0625, + "step": 90, + "train_speed(iter/s)": 0.021818 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.04996232082988063, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6953125, + "logps/chosen": -258.0, + "logps/rejected": -492.0, + "loss": 0.5208251953125, + "memory(GiB)": 51.84, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 5.34375, + "step": 95, + "train_speed(iter/s)": 0.021881 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.04511138201034887, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.765625, + "logps/chosen": -250.0, + "logps/rejected": -486.0, + "loss": 0.5463577270507812, + "memory(GiB)": 51.84, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 9.8125, + "rewards/rejected": 5.125, + "step": 100, + "train_speed(iter/s)": 0.021935 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -1.640625, + "eval_logits/rejected": -1.359375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -224.0, + "eval_loss": 0.44970703125, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.625, + "eval_rewards/rejected": 6.0625, + "eval_runtime": 4.063, + "eval_samples_per_second": 0.984, + "eval_steps_per_second": 0.492, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 53748920221696.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9089b346684edd668e861b8e56d9b6c791e891bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822009ebea2ee219263dd2da52e915b92fa67b0c2ba823e672a9569d97c36fc9 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..218505a389a8abe93dc0ced8e1d9ce0b424dff4c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..761ca8382ceabc371481895078b69e3573fc30c8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c384bb8a49a3f063fb4500a733fb7034b3f24f666aa7027108e06573a79c2ec +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a1e9b886517c8f115d41a850586db37c5050b8d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5229f3209479cb72c05518f17a46bf3b9d2f1216f433e09aa28762a36b8c9be5 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f20a3ade09b76895743802f600cfe195b7d509b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fad9248ea2b2c19baaf25824e719e4137a2cf610bbc4217abee7c8c0aab84543 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17cea2764a01871f6a1a26b88474fb83432e2342 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd8d20e0daa19f3da74a53ded05c8904aab682db343ed38f6419a9e8b219d47 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13dbdb5c086a404044eed7d3073796d64d71c67d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4351eebfddfbc95fb4f0c31aa99c4669a970f5d1dd67f76652cef5195f669754 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..784fe4dec3dcc5cce4080fdd872737750312eb5a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.55615234, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9622549154192331, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 50.11, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014247 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9351676023631289, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.420166015625, + "memory(GiB)": 50.12, + "nll_loss": 1.453125, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015625, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.0031280517578125, + "step": 5, + "train_speed(iter/s)": 0.019723 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7493921719992903, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -512.0, + "logps/rejected": -488.0, + "loss": 1.6756103515625, + "memory(GiB)": 50.12, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3671875, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.5, + "step": 10, + "train_speed(iter/s)": 0.021058 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.5025734018127719, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.671875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 
1.52564697265625, + "memory(GiB)": 50.12, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.09375, + "rewards/margins": 2.5625, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.021613 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13150315476978253, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8284912109375, + "memory(GiB)": 50.12, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.125, + "rewards/margins": 4.15625, + "rewards/rejected": 5.9375, + "step": 20, + "train_speed(iter/s)": 0.021885 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -217.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 6.8125, + "eval_runtime": 4.6538, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 11038730616832.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..9089b346684edd668e861b8e56d9b6c791e891bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822009ebea2ee219263dd2da52e915b92fa67b0c2ba823e672a9569d97c36fc9 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..218505a389a8abe93dc0ced8e1d9ce0b424dff4c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..242ad65f3505390296fd4360af91787e37c047b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c796dd339de5104ecf386c7ca9dca8e1b721e13165a8ab09772153489eeb8a +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd92569a7b357fa86644804015d2ed222218b82a --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:026426630187953efcc6b2c7c59ffdc34565c107472b170a222404a11e54be51 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb1120b2968d19e62b3330236f9f319f11f44371 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8076fc2467b00c8cac02421f7cd7621f1be3d2e0855625a527030f1dcca2228d +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..200e6b2143a228c693b631a71e9f3f8cd613d916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f261a0a7746bfdfbe379d61c4b9a1a3101c15fb35a7727443bb97548aa3453 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f92a5227f433763962cf4fe746c5f3b9fc1e078 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53d8136ed04e678c2604a1b84bd5830e6503d57715c1347a37f71f219591523 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c9a2a88cab4c0aed2694913261c6fe9151f07672 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.46972656, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9622549154192331, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 50.11, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014247 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9351676023631289, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.420166015625, + "memory(GiB)": 50.12, + "nll_loss": 1.453125, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015625, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.0031280517578125, + "step": 5, + "train_speed(iter/s)": 0.019723 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7493921719992903, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -512.0, + "logps/rejected": -488.0, + "loss": 1.6756103515625, + "memory(GiB)": 50.12, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3671875, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.5, + "step": 10, + "train_speed(iter/s)": 0.021058 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.5025734018127719, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.671875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 
1.52564697265625, + "memory(GiB)": 50.12, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.09375, + "rewards/margins": 2.5625, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.021613 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13150315476978253, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8284912109375, + "memory(GiB)": 50.12, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.125, + "rewards/margins": 4.15625, + "rewards/rejected": 5.9375, + "step": 20, + "train_speed(iter/s)": 0.021885 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -217.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 6.8125, + "eval_runtime": 4.6538, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09166399117718381, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.625, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.67142333984375, + "memory(GiB)": 50.25, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.625, + "rewards/rejected": 7.25, + "step": 25, + "train_speed(iter/s)": 0.021519 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06340553413984125, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -452.0, + "loss": 0.57769775390625, + "memory(GiB)": 50.25, + "nll_loss": 0.62109375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.3125, + "rewards/rejected": 5.71875, + "step": 30, + "train_speed(iter/s)": 0.021644 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.0632161482695805, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.5692626953125, + "memory(GiB)": 50.25, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.625, + "rewards/margins": 9.375, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.021577 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.053103368947490286, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -472.0, + "loss": 0.541656494140625, + "memory(GiB)": 50.25, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.625, + "rewards/rejected": 4.84375, + "step": 40, + "train_speed(iter/s)": 0.021811 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5078125, + "eval_logps/chosen": -788.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.4697265625, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.5, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 5.59375, + "eval_runtime": 5.3016, + "eval_samples_per_second": 0.754, + "eval_steps_per_second": 0.377, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 22611208306688.0, + "train_batch_size": 1, + 
"trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9089b346684edd668e861b8e56d9b6c791e891bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822009ebea2ee219263dd2da52e915b92fa67b0c2ba823e672a9569d97c36fc9 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..218505a389a8abe93dc0ced8e1d9ce0b424dff4c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cdc006be2b22aec22dcf304c3bdf9c4eb5352eb3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676cd36de970efda750279caa0a51db30cd9f7e5956881464f963dfa6e471da1 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69081126b67302d7697feab0658fad68ef5b1621 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ba2dad9e8fb3489fe632e7f275e808d611b307070ff5628852793951b490e9 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0f7ca56ff20cd1cb11b9067e06612297bb11c1f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91070ab185a250f1c2bc43e6ea5a5d90679d9ffd6866ed226c12e0efc16d863b +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c33a8f6b932e36445db5646999b5b2be7158d41 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c8fcfc5a7e3a3997ccd385accb83d8a322b4cdd7020d7f9ec9dbf789f4948f +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8afdbe4e7073be1f1d59805bedc3570347b027a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4329013567adcc4dbe71958d8df58ceadb8514e5968a5074267338076d097d36 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3d4ef6da301d2808860ccfcd3080f7f9d49a197 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.45556641, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9622549154192331, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 50.11, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014247 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9351676023631289, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.420166015625, + "memory(GiB)": 50.12, + "nll_loss": 1.453125, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015625, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.0031280517578125, + "step": 5, + "train_speed(iter/s)": 0.019723 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7493921719992903, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -512.0, + "logps/rejected": -488.0, + "loss": 1.6756103515625, + "memory(GiB)": 50.12, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3671875, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.5, + "step": 10, + "train_speed(iter/s)": 0.021058 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.5025734018127719, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.671875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 
1.52564697265625, + "memory(GiB)": 50.12, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.09375, + "rewards/margins": 2.5625, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.021613 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13150315476978253, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8284912109375, + "memory(GiB)": 50.12, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.125, + "rewards/margins": 4.15625, + "rewards/rejected": 5.9375, + "step": 20, + "train_speed(iter/s)": 0.021885 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -217.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 6.8125, + "eval_runtime": 4.6538, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09166399117718381, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.625, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.67142333984375, + "memory(GiB)": 50.25, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.625, + "rewards/rejected": 7.25, + "step": 25, + "train_speed(iter/s)": 0.021519 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06340553413984125, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -452.0, + "loss": 0.57769775390625, + "memory(GiB)": 50.25, + "nll_loss": 0.62109375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.3125, + "rewards/rejected": 5.71875, + "step": 30, + "train_speed(iter/s)": 0.021644 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.0632161482695805, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.5692626953125, + "memory(GiB)": 50.25, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.625, + "rewards/margins": 9.375, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.021577 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.053103368947490286, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -472.0, + "loss": 0.541656494140625, + "memory(GiB)": 50.25, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.625, + "rewards/rejected": 4.84375, + "step": 40, + "train_speed(iter/s)": 0.021811 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5078125, + "eval_logps/chosen": -788.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.4697265625, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.5, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 5.59375, + "eval_runtime": 5.3016, + "eval_samples_per_second": 0.754, + "eval_steps_per_second": 0.377, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.061679603479570924, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5596527099609375, + "memory(GiB)": 50.25, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 9.6875, + 
"rewards/rejected": 5.0625, + "step": 45, + "train_speed(iter/s)": 0.021921 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.055295440323398615, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.574530029296875, + "memory(GiB)": 50.25, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.021823 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03219995141738499, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.609375, + "logps/chosen": -340.0, + "logps/rejected": -382.0, + "loss": 0.525201416015625, + "memory(GiB)": 51.7, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.4375, + "rewards/margins": 9.8125, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02183 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04232837527263778, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.5289306640625, + "memory(GiB)": 51.7, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.021852 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -221.0, + "eval_loss": 0.45556640625, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 6.40625, + "eval_runtime": 5.3977, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.371, + "step": 60 + } + ], + 
"logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 32332961415168.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9089b346684edd668e861b8e56d9b6c791e891bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822009ebea2ee219263dd2da52e915b92fa67b0c2ba823e672a9569d97c36fc9 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..218505a389a8abe93dc0ced8e1d9ce0b424dff4c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab306f54e924bb36f6cb6190e4a003001da1dd65 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d163e1edf471dbd06c3b40d78b00f16767b6c73ce14a6d511aba7fa64e35ea +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..46330eae1f892f66b55a37b7f9c020105166ddc4 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d17239d0e628091d3d229438f754a9c37e2ad2da --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aad37f91cc173831a497409a1e53e39daa112a48ae53a6e757bceb24966ce16 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cddf4fe1ae682c0316878e67073881bccb965a51 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf03c54e9753381c8c2147dfe433b5a33eb96f1559e28dbc0fe216561630450b +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d8c7c4bf7c9f614a9deb6d953c0232042518406 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f51a19a37f1004f4021b66cc177a82f08af3abc5519c61ff90b124392a844b4 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68c431fa12e41f18e7ed8843913a786faa1e141b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fc6136fd667208aa14d49b962c185b4c43086df2120b8a85aae19aa237e0c9 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fb1eb170d639b8acad3dadd4f2cb1a883c5995 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.45068359, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9622549154192331, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 50.11, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014247 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9351676023631289, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.420166015625, + "memory(GiB)": 50.12, + "nll_loss": 1.453125, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015625, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.0031280517578125, + "step": 5, + "train_speed(iter/s)": 0.019723 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7493921719992903, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -512.0, + "logps/rejected": -488.0, + "loss": 1.6756103515625, + "memory(GiB)": 50.12, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3671875, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.5, + "step": 10, + "train_speed(iter/s)": 0.021058 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.5025734018127719, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.671875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 
1.52564697265625, + "memory(GiB)": 50.12, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.09375, + "rewards/margins": 2.5625, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.021613 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13150315476978253, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8284912109375, + "memory(GiB)": 50.12, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.125, + "rewards/margins": 4.15625, + "rewards/rejected": 5.9375, + "step": 20, + "train_speed(iter/s)": 0.021885 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -217.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.40625, + "eval_rewards/rejected": 6.8125, + "eval_runtime": 4.6538, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09166399117718381, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.625, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.67142333984375, + "memory(GiB)": 50.25, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.625, + "rewards/rejected": 7.25, + "step": 25, + "train_speed(iter/s)": 0.021519 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06340553413984125, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -452.0, + "loss": 0.57769775390625, + "memory(GiB)": 50.25, + "nll_loss": 0.62109375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.3125, + "rewards/rejected": 5.71875, + "step": 30, + "train_speed(iter/s)": 0.021644 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.0632161482695805, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.5692626953125, + "memory(GiB)": 50.25, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.625, + "rewards/margins": 9.375, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.021577 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.053103368947490286, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -472.0, + "loss": 0.541656494140625, + "memory(GiB)": 50.25, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.625, + "rewards/rejected": 4.84375, + "step": 40, + "train_speed(iter/s)": 0.021811 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5078125, + "eval_logps/chosen": -788.0, + "eval_logps/rejected": -229.0, + "eval_loss": 0.4697265625, + "eval_nll_loss": 0.5625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.5, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 5.59375, + "eval_runtime": 5.3016, + "eval_samples_per_second": 0.754, + "eval_steps_per_second": 0.377, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.061679603479570924, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5596527099609375, + "memory(GiB)": 50.25, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 9.6875, + 
"rewards/rejected": 5.0625, + "step": 45, + "train_speed(iter/s)": 0.021921 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.055295440323398615, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.574530029296875, + "memory(GiB)": 50.25, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.021823 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03219995141738499, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.609375, + "logps/chosen": -340.0, + "logps/rejected": -382.0, + "loss": 0.525201416015625, + "memory(GiB)": 51.7, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.4375, + "rewards/margins": 9.8125, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02183 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04232837527263778, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.5289306640625, + "memory(GiB)": 51.7, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.021852 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -221.0, + "eval_loss": 0.45556640625, + "eval_nll_loss": 0.546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": 6.40625, + "eval_runtime": 5.3977, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.371, + "step": 60 + }, + { + "epoch": 
2.686868686868687, + "grad_norm": 0.08354331862094756, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6796875, + "logits/rejected": -1.6953125, + "logps/chosen": -358.0, + "logps/rejected": -496.0, + "loss": 0.5365966796875, + "memory(GiB)": 51.7, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 9.9375, + "rewards/rejected": 5.71875, + "step": 65, + "train_speed(iter/s)": 0.021884 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.04503346258794718, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.546875, + "logits/rejected": -1.703125, + "logps/chosen": -253.0, + "logps/rejected": -532.0, + "loss": 0.49569091796875, + "memory(GiB)": 51.84, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 9.4375, + "rewards/rejected": 5.25, + "step": 70, + "train_speed(iter/s)": 0.02195 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.05428668175176237, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.4921875, + "logps/chosen": -368.0, + "logps/rejected": -408.0, + "loss": 0.561590576171875, + "memory(GiB)": 51.84, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 10.3125, + "rewards/rejected": 6.40625, + "step": 75, + "train_speed(iter/s)": 0.021848 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.045056732174556025, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.5546875, + "logps/chosen": -346.0, + "logps/rejected": -352.0, + "loss": 0.470965576171875, + "memory(GiB)": 51.84, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 10.625, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.021895 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.640625, + "eval_logits/rejected": -1.3671875, 
+ "eval_logps/chosen": -776.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.45068359375, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.4375, + "eval_rewards/rejected": 6.25, + "eval_runtime": 5.3733, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.372, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 42772328611840.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9089b346684edd668e861b8e56d9b6c791e891bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822009ebea2ee219263dd2da52e915b92fa67b0c2ba823e672a9569d97c36fc9 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/checkpoint-80/zero_to_fp32.py @@ 
-0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # 
there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v 
in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/logging.jsonl b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..872a27c2ce55afabfa68295e8061e067153f48ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/logging.jsonl @@ -0,0 +1,29 @@ +{"loss": 1.84960938, "grad_norm": 0.96225492, "learning_rate": 1.667e-05, "memory(GiB)": 50.11, "train_speed(iter/s)": 0.014247, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -916.0, "logps/chosen": -466.0, "logits/rejected": -1.7265625, "logits/chosen": -1.9296875, "nll_loss": 1.4765625, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "1m 4s", "remaining_time": "2h 8m 16s"} +{"loss": 2.42016602, "grad_norm": 0.9351676, "learning_rate": 8.333e-05, "memory(GiB)": 50.12, "train_speed(iter/s)": 0.019723, "rewards/chosen": 0.015625, "rewards/rejected": -0.00312805, "rewards/accuracies": 0.25, "rewards/margins": 0.01879883, "logps/rejected": -552.0, "logps/chosen": -576.0, "logits/rejected": -1.609375, "logits/chosen": -1.625, "nll_loss": 1.453125, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "4m 7s", "remaining_time": "1h 35m 3s"} +{"loss": 
1.67561035, "grad_norm": 0.74939217, "learning_rate": 9.97e-05, "memory(GiB)": 50.12, "train_speed(iter/s)": 0.021058, "rewards/chosen": 1.3671875, "rewards/rejected": 0.5, "rewards/accuracies": 0.625, "rewards/margins": 0.87109375, "logps/rejected": -488.0, "logps/chosen": -512.0, "logits/rejected": -1.46875, "logits/chosen": -1.7421875, "nll_loss": 0.98046875, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "7m 49s", "remaining_time": "1h 26m 2s"} +{"loss": 1.52564697, "grad_norm": 0.5025734, "learning_rate": 9.847e-05, "memory(GiB)": 50.12, "train_speed(iter/s)": 0.021613, "rewards/chosen": 6.09375, "rewards/rejected": 3.53125, "rewards/accuracies": 0.89999998, "rewards/margins": 2.5625, "logps/rejected": -488.0, "logps/chosen": -460.0, "logits/rejected": -1.671875, "logits/chosen": -1.828125, "nll_loss": 1.1875, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "11m 28s", "remaining_time": "1h 20m 19s"} +{"loss": 0.82849121, "grad_norm": 0.13150315, "learning_rate": 9.632e-05, "memory(GiB)": 50.12, "train_speed(iter/s)": 0.021885, "rewards/chosen": 10.125, "rewards/rejected": 5.9375, "rewards/accuracies": 0.97500002, "rewards/margins": 4.15625, "logps/rejected": -450.0, "logps/chosen": -294.0, "logits/rejected": -1.8515625, "logits/chosen": -1.7109375, "nll_loss": 0.7734375, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "15m 8s", "remaining_time": "1h 15m 41s"} +{"eval_loss": 0.55615234, "eval_runtime": 4.6538, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.43, "eval_rewards/chosen": 14.25, "eval_rewards/rejected": 6.8125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.40625, "eval_logps/rejected": -217.0, "eval_logps/chosen": -812.0, "eval_logits/rejected": -1.515625, "eval_logits/chosen": -1.625, "eval_nll_loss": 0.6875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": 
"16.67%", "elapsed_time": "15m 12s", "remaining_time": "1h 16m 4s"} +{"loss": 0.67142334, "grad_norm": 0.09166399, "learning_rate": 9.33e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021519, "rewards/chosen": 12.875, "rewards/rejected": 7.25, "rewards/accuracies": 1.0, "rewards/margins": 5.625, "logps/rejected": -438.0, "logps/chosen": -370.0, "logits/rejected": -1.625, "logits/chosen": -1.7890625, "nll_loss": 0.60546875, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "19m 16s", "remaining_time": "1h 13m 13s"} +{"loss": 0.57769775, "grad_norm": 0.06340553, "learning_rate": 8.946e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021644, "rewards/chosen": 14.0, "rewards/rejected": 5.71875, "rewards/accuracies": 1.0, "rewards/margins": 8.3125, "logps/rejected": -452.0, "logps/chosen": -370.0, "logits/rejected": -1.703125, "logits/chosen": -1.796875, "nll_loss": 0.62109375, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "23m 0s", "remaining_time": "1h 9m 1s"} +{"loss": 0.5692627, "grad_norm": 0.06321615, "learning_rate": 8.486e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021577, "rewards/chosen": 14.625, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.375, "logps/rejected": -384.0, "logps/chosen": -398.0, "logits/rejected": -1.59375, "logits/chosen": -1.828125, "nll_loss": 0.58203125, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "26m 56s", "remaining_time": "1h 5m 25s"} +{"loss": 0.54165649, "grad_norm": 0.05310337, "learning_rate": 7.961e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021811, "rewards/chosen": 14.5, "rewards/rejected": 4.84375, "rewards/accuracies": 1.0, "rewards/margins": 9.625, "logps/rejected": -472.0, "logps/chosen": -372.0, "logits/rejected": -1.7578125, "logits/chosen": -1.7109375, "nll_loss": 0.57421875, "epoch": 1.64646465, "global_step/max_steps": 
"40/120", "percentage": "33.33%", "elapsed_time": "30m 28s", "remaining_time": "1h 0m 56s"} +{"eval_loss": 0.46972656, "eval_runtime": 5.3016, "eval_samples_per_second": 0.754, "eval_steps_per_second": 0.377, "eval_rewards/chosen": 16.5, "eval_rewards/rejected": 5.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.875, "eval_logps/rejected": -229.0, "eval_logps/chosen": -788.0, "eval_logits/rejected": -1.5078125, "eval_logits/chosen": -1.6875, "eval_nll_loss": 0.5625, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "30m 33s", "remaining_time": "1h 1m 7s"} +{"loss": 0.55965271, "grad_norm": 0.0616796, "learning_rate": 7.38e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021921, "rewards/chosen": 14.75, "rewards/rejected": 5.0625, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -540.0, "logps/chosen": -296.0, "logits/rejected": -1.78125, "logits/chosen": -1.609375, "nll_loss": 0.58984375, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "34m 7s", "remaining_time": "56m 52s"} +{"loss": 0.57453003, "grad_norm": 0.05529544, "learning_rate": 6.753e-05, "memory(GiB)": 50.25, "train_speed(iter/s)": 0.021823, "rewards/chosen": 14.875, "rewards/rejected": 5.53125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -418.0, "logps/chosen": -302.0, "logits/rejected": -1.640625, "logits/chosen": -1.6953125, "nll_loss": 0.5, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "38m 5s", "remaining_time": "53m 19s"} +{"loss": 0.52520142, "grad_norm": 0.03219995, "learning_rate": 6.093e-05, "memory(GiB)": 51.7, "train_speed(iter/s)": 0.02183, "rewards/chosen": 15.4375, "rewards/rejected": 5.59375, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -382.0, "logps/chosen": -340.0, "logits/rejected": -1.609375, "logits/chosen": -1.8515625, "nll_loss": 0.5390625, 
"epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "41m 53s", "remaining_time": "49m 31s"} +{"loss": 0.52893066, "grad_norm": 0.04232838, "learning_rate": 5.413e-05, "memory(GiB)": 51.7, "train_speed(iter/s)": 0.021852, "rewards/chosen": 16.25, "rewards/rejected": 5.78125, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -374.0, "logps/chosen": -358.0, "logits/rejected": -1.6171875, "logits/chosen": -1.8046875, "nll_loss": 0.5078125, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "45m 40s", "remaining_time": "45m 40s"} +{"eval_loss": 0.45556641, "eval_runtime": 5.3977, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.371, "eval_rewards/chosen": 17.25, "eval_rewards/rejected": 6.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.875, "eval_logps/rejected": -221.0, "eval_logps/chosen": -780.0, "eval_logits/rejected": -1.4140625, "eval_logits/chosen": -1.65625, "eval_nll_loss": 0.546875, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "45m 45s", "remaining_time": "45m 45s"} +{"loss": 0.53659668, "grad_norm": 0.08354332, "learning_rate": 4.725e-05, "memory(GiB)": 51.7, "train_speed(iter/s)": 0.021884, "rewards/chosen": 15.6875, "rewards/rejected": 5.71875, "rewards/accuracies": 1.0, "rewards/margins": 9.9375, "logps/rejected": -496.0, "logps/chosen": -358.0, "logits/rejected": -1.6953125, "logits/chosen": -1.6796875, "nll_loss": 0.515625, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "49m 24s", "remaining_time": "41m 48s"} +{"loss": 0.49569092, "grad_norm": 0.04503346, "learning_rate": 4.041e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.02195, "rewards/chosen": 14.6875, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.4375, "logps/rejected": -532.0, "logps/chosen": -253.0, "logits/rejected": 
-1.703125, "logits/chosen": -1.546875, "nll_loss": 0.484375, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "53m 3s", "remaining_time": "37m 53s"} +{"loss": 0.56159058, "grad_norm": 0.05428668, "learning_rate": 3.377e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.021848, "rewards/chosen": 16.75, "rewards/rejected": 6.40625, "rewards/accuracies": 1.0, "rewards/margins": 10.3125, "logps/rejected": -408.0, "logps/chosen": -368.0, "logits/rejected": -1.4921875, "logits/chosen": -1.78125, "nll_loss": 0.47070312, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "57m 7s", "remaining_time": "34m 16s"} +{"loss": 0.47096558, "grad_norm": 0.04505673, "learning_rate": 2.742e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.021895, "rewards/chosen": 16.5, "rewards/rejected": 5.84375, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -352.0, "logps/chosen": -346.0, "logits/rejected": -1.5546875, "logits/chosen": -1.7265625, "nll_loss": 0.49023438, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "1h 0m 48s", "remaining_time": "30m 24s"} +{"eval_loss": 0.45068359, "eval_runtime": 5.3733, "eval_samples_per_second": 0.744, "eval_steps_per_second": 0.372, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.25, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.4375, "eval_logps/rejected": -222.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.3671875, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.5390625, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "1h 0m 53s", "remaining_time": "30m 26s"} +{"loss": 0.50668335, "grad_norm": 0.04890923, "learning_rate": 2.151e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.0218, "rewards/chosen": 17.125, "rewards/rejected": 5.3125, "rewards/accuracies": 1.0, "rewards/margins": 11.75, 
"logps/rejected": -476.0, "logps/chosen": -400.0, "logits/rejected": -1.640625, "logits/chosen": -1.6953125, "nll_loss": 0.54296875, "epoch": 3.52525253, "global_step/max_steps": "85/120", "percentage": "70.83%", "elapsed_time": "1h 4m 53s", "remaining_time": "26m 43s"} +{"loss": 0.47055664, "grad_norm": 0.04378772, "learning_rate": 1.614e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.021818, "rewards/chosen": 16.125, "rewards/rejected": 5.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -468.0, "logps/chosen": -302.0, "logits/rejected": -1.5625, "logits/chosen": -1.609375, "nll_loss": 0.46484375, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "1h 8m 39s", "remaining_time": "22m 53s"} +{"loss": 0.5208252, "grad_norm": 0.04996232, "learning_rate": 1.14e-05, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.021881, "rewards/chosen": 15.5625, "rewards/rejected": 5.34375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -492.0, "logps/chosen": -258.0, "logits/rejected": -1.6953125, "logits/chosen": -1.625, "nll_loss": 0.48632812, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "1h 12m 16s", "remaining_time": "19m 1s"} +{"loss": 0.54635773, "grad_norm": 0.04511138, "learning_rate": 7.4e-06, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.021935, "rewards/chosen": 15.0, "rewards/rejected": 5.125, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -486.0, "logps/chosen": -250.0, "logits/rejected": -1.765625, "logits/chosen": -1.6328125, "nll_loss": 0.43945312, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "1h 15m 53s", "remaining_time": "15m 10s"} +{"eval_loss": 0.44970703, "eval_runtime": 4.063, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.492, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.0625, "eval_rewards/accuracies": 
1.0, "eval_rewards/margins": 11.625, "eval_logps/rejected": -224.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.359375, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.5390625, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "1h 15m 57s", "remaining_time": "15m 11s"} +{"loss": 0.48547974, "grad_norm": 0.05678748, "learning_rate": 4.21e-06, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.02209, "rewards/chosen": 16.0, "rewards/rejected": 5.21875, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -436.0, "logps/chosen": -284.0, "logits/rejected": -1.5859375, "logits/chosen": -1.6328125, "nll_loss": 0.45507812, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "1h 19m 7s", "remaining_time": "11m 18s"} +{"loss": 0.53600464, "grad_norm": 0.05514065, "learning_rate": 1.89e-06, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.022256, "rewards/chosen": 16.875, "rewards/rejected": 6.0, "rewards/accuracies": 1.0, "rewards/margins": 10.9375, "logps/rejected": -588.0, "logps/chosen": -422.0, "logits/rejected": -1.625, "logits/chosen": -1.6953125, "nll_loss": 0.55859375, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "1h 22m 17s", "remaining_time": "7m 28s"} +{"loss": 0.53613892, "grad_norm": 0.08434715, "learning_rate": 4.7e-07, "memory(GiB)": 51.84, "train_speed(iter/s)": 0.022414, "rewards/chosen": 17.25, "rewards/rejected": 5.84375, "rewards/accuracies": 1.0, "rewards/margins": 11.375, "logps/rejected": -428.0, "logps/chosen": -378.0, "logits/rejected": -1.546875, "logits/chosen": -1.6796875, "nll_loss": 0.5546875, "epoch": 4.76767677, "global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "1h 25m 25s", "remaining_time": "3m 42s"} diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs/events.out.tfevents.1737732221.kml-dtmachine-18088-prod.99991.0 b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs/events.out.tfevents.1737732221.kml-dtmachine-18088-prod.99991.0 new file mode 100644 index 0000000000000000000000000000000000000000..c4ac5b55f550bc25be33aaa78eea51ed8e019aa6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v0-20250124-152200/runs/events.out.tfevents.1737732221.kml-dtmachine-18088-prod.99991.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffcad32f443793feb0be043504c78480fa5a2b18738e022a04fbfd73f11244c +size 31769 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/args.json new file mode 100644 index 0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + 
"dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, 
save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 
'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, 
sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 
More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a96100c290cb2a114bfb33f6faa484b8691229bc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8f9c28fe980ab33e81bda2515830773520b89731e80cedd3142829c4c1cd32 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcb22559807bbe9fe6bc67321ee125f4b1a80a3c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e7f692b88ecb9764b06b77d6a10f25e388095d7d1eeefb77d932d413e9fcdb +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8b05ecb86b9dfc696c8e29a1e6bd7a6308c34de --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:788a8f07cbba70a0b4d58958cc82eedb60466ad8f0a59f05bbb8b33af892bc87 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a43cd493bfff38acaa692b61da877d08808c8d03 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e06976ef4c3c32acc07db90f2553b5c853042b8ca1a43ed089e4b947ac8238 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9565d4714cc1d67720e7184ba2c456961eb94fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/global_step102/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74075f57211a8dfa6b22041d44272db794a95351cb4b97b06c931f4957df6572 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..df03da272e4cf8eac220b6310b776802c3bf8e63 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/latest @@ -0,0 +1 @@ +global_step102 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef36dc3f6cf7236807d43493bc6f3cb8df3fa50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30237d0f5baf1ca43a12c2aabc20a4324a5f8e18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc3660a08ecc14e0d76b493786a7ef538104b01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0565e80b661a234a9191c62faedc17b1ae5aa23c9527cc63349cbee8ced8b51d +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3021cec3205bac6bb31bbe558257eaa62ee692b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.44824219, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100", + "epoch": 4.161616161616162, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09053125249136239, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.6171875, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.672528076171875, + "memory(GiB)": 52.27, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.6875, + "rewards/rejected": 7.1875, + "step": 25, + "train_speed(iter/s)": 0.025025 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06259364123158316, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -454.0, + "loss": 0.57813720703125, + "memory(GiB)": 52.27, + 
"nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.375, + "rewards/rejected": 5.59375, + "step": 30, + "train_speed(iter/s)": 0.025309 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06341948959916613, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.57001953125, + "memory(GiB)": 52.27, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + "rewards/margins": 9.3125, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.025272 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.05322574754844299, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -474.0, + "loss": 0.542498779296875, + "memory(GiB)": 52.27, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.8125, + "rewards/rejected": 4.71875, + "step": 40, + "train_speed(iter/s)": 0.025462 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5, + "eval_logps/chosen": -792.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.46728515625, + "eval_nll_loss": 0.55859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.125, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 4.518, + "eval_samples_per_second": 0.885, + "eval_steps_per_second": 0.443, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.06288771891169008, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5608245849609375, + "memory(GiB)": 52.27, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + 
"rewards/margins": 9.625, + "rewards/rejected": 5.09375, + "step": 45, + "train_speed(iter/s)": 0.025554 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.05520286247644814, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.57568359375, + "memory(GiB)": 52.27, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.025458 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03187452398084299, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.6171875, + "logps/chosen": -342.0, + "logps/rejected": -382.0, + "loss": 0.525238037109375, + "memory(GiB)": 52.27, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 9.75, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02552 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04272549422103994, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.529901123046875, + "memory(GiB)": 52.27, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.025575 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.4541015625, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 11.0, + "eval_rewards/rejected": 6.3125, + "eval_runtime": 4.5254, + "eval_samples_per_second": 0.884, + "eval_steps_per_second": 0.442, + 
"step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0790840513667992, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6796875, + "logits/rejected": -1.6953125, + "logps/chosen": -358.0, + "logps/rejected": -496.0, + "loss": 0.537469482421875, + "memory(GiB)": 52.27, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 9.9375, + "rewards/rejected": 5.6875, + "step": 65, + "train_speed(iter/s)": 0.025599 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.045626680740731757, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.546875, + "logits/rejected": -1.703125, + "logps/chosen": -253.0, + "logps/rejected": -532.0, + "loss": 0.49605712890625, + "memory(GiB)": 53.72, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 9.4375, + "rewards/rejected": 5.25, + "step": 70, + "train_speed(iter/s)": 0.025673 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.05430409750149703, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.484375, + "logps/chosen": -368.0, + "logps/rejected": -408.0, + "loss": 0.562591552734375, + "memory(GiB)": 53.72, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 10.1875, + "rewards/rejected": 6.4375, + "step": 75, + "train_speed(iter/s)": 0.025585 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04354967811022926, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.5546875, + "logps/chosen": -346.0, + "logps/rejected": -352.0, + "loss": 0.4714996337890625, + "memory(GiB)": 53.72, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 10.5625, + "rewards/rejected": 5.9375, + "step": 80, + "train_speed(iter/s)": 0.025651 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.640625, + 
"eval_logits/rejected": -1.359375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4521484375, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.5, + "eval_rewards/rejected": 6.1875, + "eval_runtime": 4.3906, + "eval_samples_per_second": 0.911, + "eval_steps_per_second": 0.456, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.047973918415174815, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.6328125, + "logps/chosen": -400.0, + "logps/rejected": -476.0, + "loss": 0.50703125, + "memory(GiB)": 53.72, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.125, + "rewards/margins": 11.6875, + "rewards/rejected": 5.40625, + "step": 85, + "train_speed(iter/s)": 0.025587 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.043401090316595184, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -302.0, + "logps/rejected": -468.0, + "loss": 0.4710205078125, + "memory(GiB)": 53.72, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 11.0625, + "rewards/rejected": 5.0625, + "step": 90, + "train_speed(iter/s)": 0.025624 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.04969693397822946, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6953125, + "logps/chosen": -258.0, + "logps/rejected": -490.0, + "loss": 0.5209747314453125, + "memory(GiB)": 53.72, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 10.125, + "rewards/rejected": 5.40625, + "step": 95, + "train_speed(iter/s)": 0.025657 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.04492084211332035, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -1.625, + "logits/rejected": 
-1.765625, + "logps/chosen": -250.0, + "logps/rejected": -486.0, + "loss": 0.547564697265625, + "memory(GiB)": 53.72, + "nll_loss": 0.44140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 9.8125, + "rewards/rejected": 5.15625, + "step": 100, + "train_speed(iter/s)": 0.025612 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.34375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4482421875, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.625, + "eval_rewards/rejected": 6.0625, + "eval_runtime": 4.1505, + "eval_samples_per_second": 0.964, + "eval_steps_per_second": 0.482, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 53748920221696.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1cd1b04b261c19857aa100e8c6ba9f3d60ee0d96 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbe2bbc86ebf573f89000d4a25d729d3061eb7ea76ce043859cdb173ce73d97 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": 
"deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28359e3fe9451a278fb1aa88564b8731fbd5e989 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9589b17e5e7b8c407b4c605b5500751d354bc1ec04d874cad2a977c5ef732a41 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b75f3bdc81d89240e69eb109e313268f46a5a69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff29629f6bc08b729e80347a093aad8d73965c637af991937703845f1b03a398 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07e49e6a39e634ef967b0cdb08d1abd68382ab16 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f261bc165e44a0fc58fc535ebc035e51f6ecdd266757157105df194e4a4cdde +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f8a60f4f806b79bd8d88f6131442596235d203 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/global_step122/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf51e090971937103ddd1c3f612aaa557cbf4ecbdcf1e2d1c0012c06900b9169 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..1ff406405418d84068458850f74aecfc6224f793 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/latest @@ -0,0 +1 @@ +global_step122 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a49f44ba05d98a84fd55c18c4fa41c6437c8853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..79ef7e8924723bd699efa313eb78103d80b7edb9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40007a79aad967206b797079ca5147beff46ee1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede1043a0735266b510faa06f578fa6ef180c11e994a142a88a13ac6f33eb78b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf132b72044f0fc25b000bdc200f4b753d76c3df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.44824219, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100", + "epoch": 4.96969696969697, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09053125249136239, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.6171875, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.672528076171875, + "memory(GiB)": 52.27, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.6875, + "rewards/rejected": 7.1875, + "step": 25, + "train_speed(iter/s)": 0.025025 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06259364123158316, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -454.0, + "loss": 0.57813720703125, + "memory(GiB)": 52.27, + 
"nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.375, + "rewards/rejected": 5.59375, + "step": 30, + "train_speed(iter/s)": 0.025309 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06341948959916613, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.57001953125, + "memory(GiB)": 52.27, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + "rewards/margins": 9.3125, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.025272 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.05322574754844299, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -474.0, + "loss": 0.542498779296875, + "memory(GiB)": 52.27, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.8125, + "rewards/rejected": 4.71875, + "step": 40, + "train_speed(iter/s)": 0.025462 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5, + "eval_logps/chosen": -792.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.46728515625, + "eval_nll_loss": 0.55859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.125, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 4.518, + "eval_samples_per_second": 0.885, + "eval_steps_per_second": 0.443, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.06288771891169008, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5608245849609375, + "memory(GiB)": 52.27, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + 
"rewards/margins": 9.625, + "rewards/rejected": 5.09375, + "step": 45, + "train_speed(iter/s)": 0.025554 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.05520286247644814, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.57568359375, + "memory(GiB)": 52.27, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.025458 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03187452398084299, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.6171875, + "logps/chosen": -342.0, + "logps/rejected": -382.0, + "loss": 0.525238037109375, + "memory(GiB)": 52.27, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 9.75, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02552 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04272549422103994, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.529901123046875, + "memory(GiB)": 52.27, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.025575 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.4541015625, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 11.0, + "eval_rewards/rejected": 6.3125, + "eval_runtime": 4.5254, + "eval_samples_per_second": 0.884, + "eval_steps_per_second": 0.442, + 
"step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0790840513667992, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6796875, + "logits/rejected": -1.6953125, + "logps/chosen": -358.0, + "logps/rejected": -496.0, + "loss": 0.537469482421875, + "memory(GiB)": 52.27, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 9.9375, + "rewards/rejected": 5.6875, + "step": 65, + "train_speed(iter/s)": 0.025599 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.045626680740731757, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.546875, + "logits/rejected": -1.703125, + "logps/chosen": -253.0, + "logps/rejected": -532.0, + "loss": 0.49605712890625, + "memory(GiB)": 53.72, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 9.4375, + "rewards/rejected": 5.25, + "step": 70, + "train_speed(iter/s)": 0.025673 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.05430409750149703, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.484375, + "logps/chosen": -368.0, + "logps/rejected": -408.0, + "loss": 0.562591552734375, + "memory(GiB)": 53.72, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 10.1875, + "rewards/rejected": 6.4375, + "step": 75, + "train_speed(iter/s)": 0.025585 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04354967811022926, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.5546875, + "logps/chosen": -346.0, + "logps/rejected": -352.0, + "loss": 0.4714996337890625, + "memory(GiB)": 53.72, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 10.5625, + "rewards/rejected": 5.9375, + "step": 80, + "train_speed(iter/s)": 0.025651 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.640625, + 
"eval_logits/rejected": -1.359375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4521484375, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.5, + "eval_rewards/rejected": 6.1875, + "eval_runtime": 4.3906, + "eval_samples_per_second": 0.911, + "eval_steps_per_second": 0.456, + "step": 80 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.047973918415174815, + "learning_rate": 2.150959712448669e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.6328125, + "logps/chosen": -400.0, + "logps/rejected": -476.0, + "loss": 0.50703125, + "memory(GiB)": 53.72, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.125, + "rewards/margins": 11.6875, + "rewards/rejected": 5.40625, + "step": 85, + "train_speed(iter/s)": 0.025587 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.043401090316595184, + "learning_rate": 1.6135921418712956e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.5625, + "logps/chosen": -302.0, + "logps/rejected": -468.0, + "loss": 0.4710205078125, + "memory(GiB)": 53.72, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 11.0625, + "rewards/rejected": 5.0625, + "step": 90, + "train_speed(iter/s)": 0.025624 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.04969693397822946, + "learning_rate": 1.1404167454183957e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6953125, + "logps/chosen": -258.0, + "logps/rejected": -490.0, + "loss": 0.5209747314453125, + "memory(GiB)": 53.72, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 10.125, + "rewards/rejected": 5.40625, + "step": 95, + "train_speed(iter/s)": 0.025657 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 0.04492084211332035, + "learning_rate": 7.404029558083653e-06, + "logits/chosen": -1.625, + "logits/rejected": 
-1.765625, + "logps/chosen": -250.0, + "logps/rejected": -486.0, + "loss": 0.547564697265625, + "memory(GiB)": 53.72, + "nll_loss": 0.44140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 9.8125, + "rewards/rejected": 5.15625, + "step": 100, + "train_speed(iter/s)": 0.025612 + }, + { + "epoch": 4.161616161616162, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.34375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4482421875, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.625, + "eval_rewards/rejected": 6.0625, + "eval_runtime": 4.1505, + "eval_samples_per_second": 0.964, + "eval_steps_per_second": 0.482, + "step": 100 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.05652229958262343, + "learning_rate": 4.2113336672471245e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.5859375, + "logps/chosen": -284.0, + "logps/rejected": -436.0, + "loss": 0.485595703125, + "memory(GiB)": 53.72, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 10.75, + "rewards/rejected": 5.28125, + "step": 105, + "train_speed(iter/s)": 0.025612 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 0.05482775842378465, + "learning_rate": 1.8865999845374793e-06, + "logits/chosen": -1.6875, + "logits/rejected": -1.6171875, + "logps/chosen": -422.0, + "logps/rejected": -592.0, + "loss": 0.53631591796875, + "memory(GiB)": 53.72, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.0, + "rewards/margins": 11.0625, + "rewards/rejected": 5.90625, + "step": 110, + "train_speed(iter/s)": 0.025642 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 0.0914572949853764, + "learning_rate": 4.738957681248379e-07, + "logits/chosen": -1.6796875, + "logits/rejected": -1.546875, + "logps/chosen": -378.0, + "logps/rejected": -428.0, + "loss": 
0.5370529174804688, + "memory(GiB)": 53.72, + "nll_loss": 0.5546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 11.3125, + "rewards/rejected": 5.875, + "step": 115, + "train_speed(iter/s)": 0.025681 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.05182072182744844, + "learning_rate": 0.0, + "logits/chosen": -1.5703125, + "logits/rejected": -1.484375, + "logps/chosen": -390.0, + "logps/rejected": -528.0, + "loss": 0.45579071044921876, + "memory(GiB)": 53.72, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 11.3125, + "rewards/rejected": 5.96875, + "step": 120, + "train_speed(iter/s)": 0.025675 + }, + { + "epoch": 4.96969696969697, + "eval_logits/chosen": -1.640625, + "eval_logits/rejected": -1.3515625, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -225.0, + "eval_loss": 0.44873046875, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.75, + "eval_rewards/rejected": 6.0, + "eval_runtime": 4.0424, + "eval_samples_per_second": 0.99, + "eval_steps_per_second": 0.495, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 64331810078720.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4bfba62689378d12f53f8c144b50e7f53149f944 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c5f3b969509ba4b6fd0a0012c682b740839d2f8858533f56ccdf447ad7738b +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df6e4f8da9895a9bdf9fde74fb3a961bf924e10f --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3261f89a2c3ed6a8389c4b4e64a7dc8f7ed8bb40a3731618a95fd3140e1d57cd +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1c25eb702339f7e58974af16ef7e2ede1a0cae0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb2f0e157b4fb1a59708c22b08096c6b96427498961e755062156a4517e88d02 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17cea2764a01871f6a1a26b88474fb83432e2342 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd8d20e0daa19f3da74a53ded05c8904aab682db343ed38f6419a9e8b219d47 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13dbdb5c086a404044eed7d3073796d64d71c67d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4351eebfddfbc95fb4f0c31aa99c4669a970f5d1dd67f76652cef5195f669754 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d46a9ba7690e83fef48d0cf5f4c34bd9df6cc737 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23784d04394ff924f7fca03236f62241ce5f4b6e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6dd3673e956bd77979f9bea8dac8586ade6b37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63645486a3e3e7ae3df0b90ccdad6a88372bee5e36403d533a3b105e3057dd94 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1b1ea2e961af6eeb1788da6e4f0546138774b6f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.55615234, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 11038730616832.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4e0305d7a43b5dbfdb4e86f289f34ed61ac3f70c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538c825ae08e263cfe8340d4da9c5399aa7a594d8668075aea8a915efafe4b8d +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3d8f8f4edd2460738c18dd77d7c7ac50ea35942 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d80393c5332ed48eacc4106f1f1c5bd9e23be66bffa61675addc3055d0d65d4b +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d398443c23b1aa80b8d7d1515f9b8e7d9d5748c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8eca05a9c61b0ff852d7de3e043d457320a8e20ea689cae0221bf91bcd2668 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..200e6b2143a228c693b631a71e9f3f8cd613d916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f261a0a7746bfdfbe379d61c4b9a1a3101c15fb35a7727443bb97548aa3453 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f92a5227f433763962cf4fe746c5f3b9fc1e078 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53d8136ed04e678c2604a1b84bd5830e6503d57715c1347a37f71f219591523 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e39cd89edd6409a9e49b8db7f0d371695a2623d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9affc1541e7e94c18354d5173bc55400c5f07faf3d080c6d453d48e7a8d6ac3 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1b839d26b0a64f427c73c634fb491ba9ddf3381 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748c3ebf0e4c051c58b92e4a8c5b87cdb39d55cfdc2aec81a1baef0f02fc113 +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..121c743afef4c40e4572ecca4130174de738c541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a37c9eca873a12cf719701e73828f6bf8d478061339d68aed6984c89fbba68 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4fec1dce43dfc604481b2c23697654f85716edf6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.46728516, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40", + "epoch": 1.6464646464646466, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09053125249136239, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.6171875, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.672528076171875, + "memory(GiB)": 52.27, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.6875, + "rewards/rejected": 7.1875, + "step": 25, + "train_speed(iter/s)": 0.025025 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06259364123158316, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -454.0, + "loss": 0.57813720703125, + "memory(GiB)": 52.27, + 
"nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.375, + "rewards/rejected": 5.59375, + "step": 30, + "train_speed(iter/s)": 0.025309 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06341948959916613, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.57001953125, + "memory(GiB)": 52.27, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + "rewards/margins": 9.3125, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.025272 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.05322574754844299, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -474.0, + "loss": 0.542498779296875, + "memory(GiB)": 52.27, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.8125, + "rewards/rejected": 4.71875, + "step": 40, + "train_speed(iter/s)": 0.025462 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5, + "eval_logps/chosen": -792.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.46728515625, + "eval_nll_loss": 0.55859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.125, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 4.518, + "eval_samples_per_second": 0.885, + "eval_steps_per_second": 0.443, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 22611208306688.0, 
+ "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9fca202c2f29f0d5d524bce7cab3262179a4eb2f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608b3fc476792eb6734bfd45be77ac46d6c3b6f462784c5aef479b67001478c2 +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc2bcae2c6ebb9a7e4fcc2961895f650e9f50809 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead703215d6b5b47f85ee177622bc47ca3db08b0ba673487eaa84d24314dcfca +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c80c3933d94a027849ac4b0a50ae4e97c1563cc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ccf45ab2e8a4894ff2c1cb4cba08f2e3f569a20aee1a80d1ea9f3c395409b85 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c33a8f6b932e36445db5646999b5b2be7158d41 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c8fcfc5a7e3a3997ccd385accb83d8a322b4cdd7020d7f9ec9dbf789f4948f +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8afdbe4e7073be1f1d59805bedc3570347b027a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/global_step61/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4329013567adcc4dbe71958d8df58ceadb8514e5968a5074267338076d097d36 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..2ab068060bb48c11f977b3517d525b0c1d1b451e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/latest @@ -0,0 +1 @@ +global_step61 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..07a546a3d8fa499648a42db76ea9733d09e5ca98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a17ffe4d1cfad70857491e1fd7e427c0413a789e2cb4398c4af3ca8efd92a5 +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5552726456b4cc7d1cc941b486f870e723d6ab42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8430d63cfb7960c36461376f5e1ef952c23b5128eae3a1f763753f4c308fd4aa +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76668aa0c95685940d77d7c5ce6c001f6db7c2c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713c005c4b73241d6fc347c8a1ef4929922d2b3ba4d5e78a796f5a8d398fff6b +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5397ed41c59b44196ce626d4e46c8d2e508f0b2e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.45410156, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60", + "epoch": 2.484848484848485, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09053125249136239, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.6171875, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.672528076171875, + "memory(GiB)": 52.27, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.6875, + "rewards/rejected": 7.1875, + "step": 25, + "train_speed(iter/s)": 0.025025 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06259364123158316, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -454.0, + "loss": 0.57813720703125, + "memory(GiB)": 52.27, + 
"nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.375, + "rewards/rejected": 5.59375, + "step": 30, + "train_speed(iter/s)": 0.025309 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06341948959916613, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.57001953125, + "memory(GiB)": 52.27, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + "rewards/margins": 9.3125, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.025272 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.05322574754844299, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -474.0, + "loss": 0.542498779296875, + "memory(GiB)": 52.27, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.8125, + "rewards/rejected": 4.71875, + "step": 40, + "train_speed(iter/s)": 0.025462 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5, + "eval_logps/chosen": -792.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.46728515625, + "eval_nll_loss": 0.55859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.125, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 4.518, + "eval_samples_per_second": 0.885, + "eval_steps_per_second": 0.443, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.06288771891169008, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5608245849609375, + "memory(GiB)": 52.27, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + 
"rewards/margins": 9.625, + "rewards/rejected": 5.09375, + "step": 45, + "train_speed(iter/s)": 0.025554 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.05520286247644814, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.57568359375, + "memory(GiB)": 52.27, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.025458 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03187452398084299, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.6171875, + "logps/chosen": -342.0, + "logps/rejected": -382.0, + "loss": 0.525238037109375, + "memory(GiB)": 52.27, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 9.75, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02552 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04272549422103994, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.529901123046875, + "memory(GiB)": 52.27, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.025575 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.4541015625, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 11.0, + "eval_rewards/rejected": 6.3125, + "eval_runtime": 4.5254, + "eval_samples_per_second": 0.884, + "eval_steps_per_second": 0.442, + 
"step": 60 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 32332961415168.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2527ccd4aab1eb580cb2394089d34d76ec51d555 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "down_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d4fd2a7ad726caa7a6f04643967b9926afae56cf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf1d677159e010e3c294c146dc0e01f771e267d65ecc161ee914cdd87ea0f4d +size 68902296 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..22839ebae6bbe80fd5daf71f2c84d0fb3bcc38fc --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 2, + "local_world_size": 2, + "model_suffix": "deepseek-r1-14b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=8192, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d21fe560ded634eadc3540ce707ecd0cf4b954a --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c82abe2d456f92582f348f302f2d329da2e0b4cdb56b48f07c1b9a6799d6e57 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68f24329a4f086c8af814b3dc8e76fd56276499d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda406d6c7393fbcbb2cdeee2ba298abb385574b6abd81e34c7193705c12e158 +size 206442416 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d8c7c4bf7c9f614a9deb6d953c0232042518406 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f51a19a37f1004f4021b66cc177a82f08af3abc5519c61ff90b124392a844b4 +size 664974 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68c431fa12e41f18e7ed8843913a786faa1e141b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/global_step81/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fc6136fd667208aa14d49b962c185b4c43086df2120b8a85aae19aa237e0c9 +size 664974 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..2bd8ce78b3973ba141f9add734eb607bba845fa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/latest @@ -0,0 +1 @@ +global_step81 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e5aaa9ad20a7866eaa33077664ec220ecdb611 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcb161b22b2558dbf7e3f8c871050cec383d11a40423fab11f18d5e630639bf +size 14512 diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4628fa33f0faa593d13444a49e4f052ec6824760 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50af6aef769414a6f28fa1b1bc51ce707dc8ecd15474e03f99a2f10fde086be +size 14512 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffbf7f85d5da821857de440c287be705c5aacef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b4e5115d96ef71a04742fa60d525add0a4d6781efee6e4052ec0c78c7d6c19 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c0f1d07b771de85d44498c1d9e5ddb1546e6b9b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.45214844, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80", + "epoch": 3.323232323232323, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 0.9619308660072415, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.9296875, + "logits/rejected": -1.7265625, + "logps/chosen": -466.0, + "logps/rejected": -916.0, + "loss": 1.849609375, + "memory(GiB)": 49.37, + "nll_loss": 1.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.014445 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.9539301708536683, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.609375, + "logps/chosen": -576.0, + "logps/rejected": -552.0, + "loss": 2.4267578125, + "memory(GiB)": 49.37, + "nll_loss": 1.453125, + "rewards/accuracies": 0.15625, + "rewards/chosen": 0.0281982421875, + "rewards/margins": -0.00311279296875, + "rewards/rejected": 0.03125, + "step": 5, + "train_speed(iter/s)": 0.022275 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.7674288621413258, + "learning_rate": 9.969653386589748e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -510.0, + "logps/rejected": -488.0, + "loss": 1.6552734375, + "memory(GiB)": 49.37, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4296875, + "rewards/margins": 0.9921875, + "rewards/rejected": 0.43359375, + "step": 10, + "train_speed(iter/s)": 0.024195 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.49307127908285064, + "learning_rate": 9.847001329696653e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6640625, + "logps/chosen": -462.0, + "logps/rejected": 
-488.0, + "loss": 1.51802978515625, + "memory(GiB)": 50.82, + "nll_loss": 1.1875, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": 3.53125, + "step": 15, + "train_speed(iter/s)": 0.024958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12751888679721451, + "learning_rate": 9.632470336074009e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.8515625, + "logps/chosen": -294.0, + "logps/rejected": -450.0, + "loss": 0.8334716796875, + "memory(GiB)": 50.82, + "nll_loss": 0.7734375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 10.0625, + "rewards/margins": 4.21875, + "rewards/rejected": 5.875, + "step": 20, + "train_speed(iter/s)": 0.025382 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.515625, + "eval_logps/chosen": -812.0, + "eval_logps/rejected": -218.0, + "eval_loss": 0.55615234375, + "eval_nll_loss": 0.6875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.25, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 6.75, + "eval_runtime": 4.2332, + "eval_samples_per_second": 0.945, + "eval_steps_per_second": 0.472, + "step": 20 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.09053125249136239, + "learning_rate": 9.330127018922194e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.6171875, + "logps/chosen": -370.0, + "logps/rejected": -438.0, + "loss": 0.672528076171875, + "memory(GiB)": 52.27, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.875, + "rewards/margins": 5.6875, + "rewards/rejected": 7.1875, + "step": 25, + "train_speed(iter/s)": 0.025025 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.06259364123158316, + "learning_rate": 8.945702546981969e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.703125, + "logps/chosen": -370.0, + "logps/rejected": -454.0, + "loss": 0.57813720703125, + "memory(GiB)": 52.27, + 
"nll_loss": 0.62109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 8.375, + "rewards/rejected": 5.59375, + "step": 30, + "train_speed(iter/s)": 0.025309 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.06341948959916613, + "learning_rate": 8.486484005469977e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.59375, + "logps/chosen": -398.0, + "logps/rejected": -384.0, + "loss": 0.57001953125, + "memory(GiB)": 52.27, + "nll_loss": 0.58203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + "rewards/margins": 9.3125, + "rewards/rejected": 5.25, + "step": 35, + "train_speed(iter/s)": 0.025272 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.05322574754844299, + "learning_rate": 7.961176263324901e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7578125, + "logps/chosen": -372.0, + "logps/rejected": -474.0, + "loss": 0.542498779296875, + "memory(GiB)": 52.27, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5, + "rewards/margins": 9.8125, + "rewards/rejected": 4.71875, + "step": 40, + "train_speed(iter/s)": 0.025462 + }, + { + "epoch": 1.6464646464646466, + "eval_logits/chosen": -1.6875, + "eval_logits/rejected": -1.5, + "eval_logps/chosen": -792.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.46728515625, + "eval_nll_loss": 0.55859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 16.125, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": 5.5625, + "eval_runtime": 4.518, + "eval_samples_per_second": 0.885, + "eval_steps_per_second": 0.443, + "step": 40 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.06288771891169008, + "learning_rate": 7.379736965185368e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.78125, + "logps/chosen": -296.0, + "logps/rejected": -540.0, + "loss": 0.5608245849609375, + "memory(GiB)": 52.27, + "nll_loss": 0.58984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + 
"rewards/margins": 9.625, + "rewards/rejected": 5.09375, + "step": 45, + "train_speed(iter/s)": 0.025554 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.05520286247644814, + "learning_rate": 6.753187775963773e-05, + "logits/chosen": -1.6953125, + "logits/rejected": -1.640625, + "logps/chosen": -302.0, + "logps/rejected": -418.0, + "loss": 0.57568359375, + "memory(GiB)": 52.27, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 9.3125, + "rewards/rejected": 5.53125, + "step": 50, + "train_speed(iter/s)": 0.025458 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.03187452398084299, + "learning_rate": 6.09340545603188e-05, + "logits/chosen": -1.8515625, + "logits/rejected": -1.6171875, + "logps/chosen": -342.0, + "logps/rejected": -382.0, + "loss": 0.525238037109375, + "memory(GiB)": 52.27, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 9.75, + "rewards/rejected": 5.59375, + "step": 55, + "train_speed(iter/s)": 0.02552 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.04272549422103994, + "learning_rate": 5.4128967273616625e-05, + "logits/chosen": -1.8046875, + "logits/rejected": -1.6171875, + "logps/chosen": -358.0, + "logps/rejected": -374.0, + "loss": 0.529901123046875, + "memory(GiB)": 52.27, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 10.4375, + "rewards/rejected": 5.78125, + "step": 60, + "train_speed(iter/s)": 0.025575 + }, + { + "epoch": 2.484848484848485, + "eval_logits/chosen": -1.65625, + "eval_logits/rejected": -1.4140625, + "eval_logps/chosen": -780.0, + "eval_logps/rejected": -222.0, + "eval_loss": 0.4541015625, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.25, + "eval_rewards/margins": 11.0, + "eval_rewards/rejected": 6.3125, + "eval_runtime": 4.5254, + "eval_samples_per_second": 0.884, + "eval_steps_per_second": 0.442, + 
"step": 60 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.0790840513667992, + "learning_rate": 4.7245611982206724e-05, + "logits/chosen": -1.6796875, + "logits/rejected": -1.6953125, + "logps/chosen": -358.0, + "logps/rejected": -496.0, + "loss": 0.537469482421875, + "memory(GiB)": 52.27, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 9.9375, + "rewards/rejected": 5.6875, + "step": 65, + "train_speed(iter/s)": 0.025599 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.045626680740731757, + "learning_rate": 4.0414468403813095e-05, + "logits/chosen": -1.546875, + "logits/rejected": -1.703125, + "logps/chosen": -253.0, + "logps/rejected": -532.0, + "loss": 0.49605712890625, + "memory(GiB)": 53.72, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.6875, + "rewards/margins": 9.4375, + "rewards/rejected": 5.25, + "step": 70, + "train_speed(iter/s)": 0.025673 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.05430409750149703, + "learning_rate": 3.3765026539765834e-05, + "logits/chosen": -1.78125, + "logits/rejected": -1.484375, + "logps/chosen": -368.0, + "logps/rejected": -408.0, + "loss": 0.562591552734375, + "memory(GiB)": 53.72, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 10.1875, + "rewards/rejected": 6.4375, + "step": 75, + "train_speed(iter/s)": 0.025585 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 0.04354967811022926, + "learning_rate": 2.7423332084455544e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.5546875, + "logps/chosen": -346.0, + "logps/rejected": -352.0, + "loss": 0.4714996337890625, + "memory(GiB)": 53.72, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 10.5625, + "rewards/rejected": 5.9375, + "step": 80, + "train_speed(iter/s)": 0.025651 + }, + { + "epoch": 3.323232323232323, + "eval_logits/chosen": -1.640625, + 
"eval_logits/rejected": -1.359375, + "eval_logps/chosen": -776.0, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4521484375, + "eval_nll_loss": 0.5390625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 17.75, + "eval_rewards/margins": 11.5, + "eval_rewards/rejected": 6.1875, + "eval_runtime": 4.3906, + "eval_samples_per_second": 0.911, + "eval_steps_per_second": 0.456, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 120, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 42772328611840.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c9d81afcc7347a9ba35b3d69447fe58fd3c263 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087a93b258a0db88ffd492c4fbe7e8ec1ee0f9b749005d8f9baa0a9c1e0ac44 +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..368606f1f91bf72c6fb658a422750ebce4cc50fe Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f29880d1230768c30792aa9dedfe2454c50e961e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..3d35a880993ceeb48269d9eb90d583dbe4ba2e1b Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c6ab09fbc122d1489736b230694fb1294b2a30da Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..e6c940083b96967d9abc6f40546952a1b7106f30 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..d8eb491fa2a43ff3cf2abb2f67df1eb5b4813972 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..8cead5875b75d617a13f1247e15b17c39a5168b6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..9117fb1e1bf882dc8e76d4801dc5d0f91f782cb2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..8af1759b5aa17d3847bab0d934c5d3deda6be0d2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..035a751d5a052bbeac6d00f213b1ca1637da8dfc Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..1067daf626f49f13be6303dff19d432c76571b1a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..76c3a63c52f737d8888ba456ba297d96870dfa86 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..9573adedf5fbc0f6e22419f172163f7fe4e5e382 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..2c72126dce5328d7e5cbf3f3ba730b04185f2ff2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..115507aeb505f38436e92977e61c96d75c0b4266 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..a138c3207804ef5be69dc8ff0d41e1abf82a564e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..1bbc3b943c366610150cbe8dfc061ee03c11088a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..cd72f4b3f786a9793cfa817379d579d2c3b4117d Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..220fa0a7c65e884d510c01453827740b63413b75 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..de1a35c6ee67281c5ec2edce207af214f57dc6d5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..35fae4e08e6ff795d20986d08ad147aa4526885f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..733e00e4c2a3acb6fc05f647cbfe4304f11be7be Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..38f51b3fc8813647180c933f9de164aab34b71c5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..0890a3ee8f75df19ca7475f1a5b65ec524fa9ac6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2ecbde9a97416e120a36d3686b4d757f078742b3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..3f6df532f3c4c2d1d2d45e16e9e0396995be9596 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..6dbc5c1ffdc7546c026039bb427f51ad5c616ef3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..a98479d11a58d80f0eb4e32a67f5a653e3f16611 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..bd1f7356ddf4b2ec3e219f9d0f9282af2bf9b119 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_loss.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..ad13f7f273ffe0a78a9a1b30a6302de990295fd3 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..ff2e8dddbbbff905c4e9849841371a8d84b479fd Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..8613698b219be945f18378fb1d26ea4da14090d0 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..836fc6ae43413cd867a5d01e5502139388d64a57 
Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/logging.jsonl b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7776aea21655ad5f1d132418075311ed2f995b8a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/logging.jsonl @@ -0,0 +1,33 @@ +{"loss": 1.84960938, "grad_norm": 0.96193087, "learning_rate": 1.667e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.014445, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -916.0, "logps/chosen": -466.0, "logits/rejected": -1.7265625, "logits/chosen": -1.9296875, "nll_loss": 1.4765625, "epoch": 0.04040404, "global_step/max_steps": "1/120", "percentage": "0.83%", "elapsed_time": "1m 0s", "remaining_time": "2h 0m 6s"} +{"loss": 2.42675781, "grad_norm": 0.95393017, "learning_rate": 8.333e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.022275, "rewards/chosen": 0.02819824, "rewards/rejected": 0.03125, "rewards/accuracies": 0.15625, "rewards/margins": -0.00311279, "logps/rejected": -552.0, "logps/chosen": -576.0, "logits/rejected": -1.609375, "logits/chosen": -1.625, "nll_loss": 1.453125, "epoch": 0.2020202, "global_step/max_steps": "5/120", "percentage": "4.17%", "elapsed_time": "3m 35s", "remaining_time": "1h 22m 43s"} +{"loss": 1.65527344, "grad_norm": 0.76742886, "learning_rate": 9.97e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.024195, "rewards/chosen": 1.4296875, "rewards/rejected": 0.43359375, "rewards/accuracies": 0.69999999, "rewards/margins": 0.9921875, "logps/rejected": -488.0, "logps/chosen": -510.0, "logits/rejected": -1.46875, "logits/chosen": 
-1.7421875, "nll_loss": 0.98046875, "epoch": 0.4040404, "global_step/max_steps": "10/120", "percentage": "8.33%", "elapsed_time": "6m 44s", "remaining_time": "1h 14m 11s"} +{"loss": 1.51802979, "grad_norm": 0.49307128, "learning_rate": 9.847e-05, "memory(GiB)": 50.82, "train_speed(iter/s)": 0.024958, "rewards/chosen": 6.0625, "rewards/rejected": 3.53125, "rewards/accuracies": 0.89999998, "rewards/margins": 2.546875, "logps/rejected": -488.0, "logps/chosen": -462.0, "logits/rejected": -1.6640625, "logits/chosen": -1.828125, "nll_loss": 1.1875, "epoch": 0.60606061, "global_step/max_steps": "15/120", "percentage": "12.50%", "elapsed_time": "9m 52s", "remaining_time": "1h 9m 6s"} +{"loss": 0.83347168, "grad_norm": 0.12751889, "learning_rate": 9.632e-05, "memory(GiB)": 50.82, "train_speed(iter/s)": 0.025382, "rewards/chosen": 10.0625, "rewards/rejected": 5.875, "rewards/accuracies": 0.97500002, "rewards/margins": 4.21875, "logps/rejected": -450.0, "logps/chosen": -294.0, "logits/rejected": -1.8515625, "logits/chosen": -1.7109375, "nll_loss": 0.7734375, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "12m 59s", "remaining_time": "1h 4m 56s"} +{"eval_loss": 0.55615234, "eval_runtime": 4.2332, "eval_samples_per_second": 0.945, "eval_steps_per_second": 0.472, "eval_rewards/chosen": 14.25, "eval_rewards/rejected": 6.75, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.4375, "eval_logps/rejected": -218.0, "eval_logps/chosen": -812.0, "eval_logits/rejected": -1.515625, "eval_logits/chosen": -1.625, "eval_nll_loss": 0.6875, "epoch": 0.80808081, "global_step/max_steps": "20/120", "percentage": "16.67%", "elapsed_time": "13m 3s", "remaining_time": "1h 5m 17s"} +{"loss": 0.67252808, "grad_norm": 0.09053125, "learning_rate": 9.33e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025025, "rewards/chosen": 12.875, "rewards/rejected": 7.1875, "rewards/accuracies": 1.0, "rewards/margins": 5.6875, "logps/rejected": -438.0, 
"logps/chosen": -370.0, "logits/rejected": -1.6171875, "logits/chosen": -1.7890625, "nll_loss": 0.609375, "epoch": 1.04040404, "global_step/max_steps": "25/120", "percentage": "20.83%", "elapsed_time": "16m 30s", "remaining_time": "1h 2m 43s"} +{"loss": 0.57813721, "grad_norm": 0.06259364, "learning_rate": 8.946e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025309, "rewards/chosen": 14.0, "rewards/rejected": 5.59375, "rewards/accuracies": 1.0, "rewards/margins": 8.375, "logps/rejected": -454.0, "logps/chosen": -370.0, "logits/rejected": -1.703125, "logits/chosen": -1.796875, "nll_loss": 0.62109375, "epoch": 1.24242424, "global_step/max_steps": "30/120", "percentage": "25.00%", "elapsed_time": "19m 36s", "remaining_time": "58m 50s"} +{"loss": 0.57001953, "grad_norm": 0.06341949, "learning_rate": 8.486e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025272, "rewards/chosen": 14.5625, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -384.0, "logps/chosen": -398.0, "logits/rejected": -1.59375, "logits/chosen": -1.8359375, "nll_loss": 0.58203125, "epoch": 1.44444444, "global_step/max_steps": "35/120", "percentage": "29.17%", "elapsed_time": "22m 56s", "remaining_time": "55m 42s"} +{"loss": 0.54249878, "grad_norm": 0.05322575, "learning_rate": 7.961e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025462, "rewards/chosen": 14.5, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -474.0, "logps/chosen": -372.0, "logits/rejected": -1.7578125, "logits/chosen": -1.7109375, "nll_loss": 0.57421875, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "26m 2s", "remaining_time": "52m 4s"} +{"eval_loss": 0.46728516, "eval_runtime": 4.518, "eval_samples_per_second": 0.885, "eval_steps_per_second": 0.443, "eval_rewards/chosen": 16.125, "eval_rewards/rejected": 5.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 
10.5625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -792.0, "eval_logits/rejected": -1.5, "eval_logits/chosen": -1.6875, "eval_nll_loss": 0.55859375, "epoch": 1.64646465, "global_step/max_steps": "40/120", "percentage": "33.33%", "elapsed_time": "26m 6s", "remaining_time": "52m 13s"} +{"loss": 0.56082458, "grad_norm": 0.06288772, "learning_rate": 7.38e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025554, "rewards/chosen": 14.6875, "rewards/rejected": 5.09375, "rewards/accuracies": 1.0, "rewards/margins": 9.625, "logps/rejected": -540.0, "logps/chosen": -296.0, "logits/rejected": -1.78125, "logits/chosen": -1.609375, "nll_loss": 0.58984375, "epoch": 1.84848485, "global_step/max_steps": "45/120", "percentage": "37.50%", "elapsed_time": "29m 12s", "remaining_time": "48m 40s"} +{"loss": 0.57568359, "grad_norm": 0.05520286, "learning_rate": 6.753e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025458, "rewards/chosen": 14.875, "rewards/rejected": 5.53125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -418.0, "logps/chosen": -302.0, "logits/rejected": -1.640625, "logits/chosen": -1.6953125, "nll_loss": 0.50390625, "epoch": 2.08080808, "global_step/max_steps": "50/120", "percentage": "41.67%", "elapsed_time": "32m 35s", "remaining_time": "45m 37s"} +{"loss": 0.52523804, "grad_norm": 0.03187452, "learning_rate": 6.093e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.02552, "rewards/chosen": 15.375, "rewards/rejected": 5.59375, "rewards/accuracies": 1.0, "rewards/margins": 9.75, "logps/rejected": -382.0, "logps/chosen": -342.0, "logits/rejected": -1.6171875, "logits/chosen": -1.8515625, "nll_loss": 0.5390625, "epoch": 2.28282828, "global_step/max_steps": "55/120", "percentage": "45.83%", "elapsed_time": "35m 46s", "remaining_time": "42m 16s"} +{"loss": 0.52990112, "grad_norm": 0.04272549, "learning_rate": 5.413e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025575, "rewards/chosen": 16.25, "rewards/rejected": 5.78125, 
"rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -374.0, "logps/chosen": -358.0, "logits/rejected": -1.6171875, "logits/chosen": -1.8046875, "nll_loss": 0.51171875, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "38m 57s", "remaining_time": "38m 57s"} +{"eval_loss": 0.45410156, "eval_runtime": 4.5254, "eval_samples_per_second": 0.884, "eval_steps_per_second": 0.442, "eval_rewards/chosen": 17.25, "eval_rewards/rejected": 6.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.0, "eval_logps/rejected": -222.0, "eval_logps/chosen": -780.0, "eval_logits/rejected": -1.4140625, "eval_logits/chosen": -1.65625, "eval_nll_loss": 0.5390625, "epoch": 2.48484848, "global_step/max_steps": "60/120", "percentage": "50.00%", "elapsed_time": "39m 1s", "remaining_time": "39m 1s"} +{"loss": 0.53746948, "grad_norm": 0.07908405, "learning_rate": 4.725e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025599, "rewards/chosen": 15.625, "rewards/rejected": 5.6875, "rewards/accuracies": 1.0, "rewards/margins": 9.9375, "logps/rejected": -496.0, "logps/chosen": -358.0, "logits/rejected": -1.6953125, "logits/chosen": -1.6796875, "nll_loss": 0.515625, "epoch": 2.68686869, "global_step/max_steps": "65/120", "percentage": "54.17%", "elapsed_time": "42m 10s", "remaining_time": "35m 41s"} +{"loss": 0.49605713, "grad_norm": 0.04562668, "learning_rate": 4.041e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025673, "rewards/chosen": 14.6875, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.4375, "logps/rejected": -532.0, "logps/chosen": -253.0, "logits/rejected": -1.703125, "logits/chosen": -1.546875, "nll_loss": 0.48632812, "epoch": 2.88888889, "global_step/max_steps": "70/120", "percentage": "58.33%", "elapsed_time": "45m 17s", "remaining_time": "32m 21s"} +{"loss": 0.56259155, "grad_norm": 0.0543041, "learning_rate": 3.377e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025585, 
"rewards/chosen": 16.625, "rewards/rejected": 6.4375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -408.0, "logps/chosen": -368.0, "logits/rejected": -1.484375, "logits/chosen": -1.78125, "nll_loss": 0.47070312, "epoch": 3.12121212, "global_step/max_steps": "75/120", "percentage": "62.50%", "elapsed_time": "48m 42s", "remaining_time": "29m 13s"} +{"loss": 0.47149963, "grad_norm": 0.04354968, "learning_rate": 2.742e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025651, "rewards/chosen": 16.5, "rewards/rejected": 5.9375, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -352.0, "logps/chosen": -346.0, "logits/rejected": -1.5546875, "logits/chosen": -1.7265625, "nll_loss": 0.49023438, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "51m 50s", "remaining_time": "25m 55s"} +{"eval_loss": 0.45214844, "eval_runtime": 4.3906, "eval_samples_per_second": 0.911, "eval_steps_per_second": 0.456, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.5, "eval_logps/rejected": -223.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.359375, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.5390625, "epoch": 3.32323232, "global_step/max_steps": "80/120", "percentage": "66.67%", "elapsed_time": "51m 54s", "remaining_time": "25m 57s"} +{"loss": 0.50703125, "grad_norm": 0.04797392, "learning_rate": 2.151e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025587, "rewards/chosen": 17.125, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 11.6875, "logps/rejected": -476.0, "logps/chosen": -400.0, "logits/rejected": -1.6328125, "logits/chosen": -1.6953125, "nll_loss": 0.54296875, "epoch": 3.52525253, "global_step/max_steps": "85/120", "percentage": "70.83%", "elapsed_time": "55m 13s", "remaining_time": "22m 44s"} +{"loss": 0.47102051, "grad_norm": 0.04340109, "learning_rate": 
1.614e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025624, "rewards/chosen": 16.125, "rewards/rejected": 5.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -468.0, "logps/chosen": -302.0, "logits/rejected": -1.5625, "logits/chosen": -1.609375, "nll_loss": 0.46484375, "epoch": 3.72727273, "global_step/max_steps": "90/120", "percentage": "75.00%", "elapsed_time": "58m 23s", "remaining_time": "19m 27s"} +{"loss": 0.52097473, "grad_norm": 0.04969693, "learning_rate": 1.14e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025657, "rewards/chosen": 15.5625, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 10.125, "logps/rejected": -490.0, "logps/chosen": -258.0, "logits/rejected": -1.6953125, "logits/chosen": -1.625, "nll_loss": 0.48632812, "epoch": 3.92929293, "global_step/max_steps": "95/120", "percentage": "79.17%", "elapsed_time": "1h 1m 34s", "remaining_time": "16m 12s"} +{"loss": 0.5475647, "grad_norm": 0.04492084, "learning_rate": 7.4e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025612, "rewards/chosen": 14.9375, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -486.0, "logps/chosen": -250.0, "logits/rejected": -1.765625, "logits/chosen": -1.625, "nll_loss": 0.44140625, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "1h 4m 55s", "remaining_time": "12m 59s"} +{"eval_loss": 0.44824219, "eval_runtime": 4.1505, "eval_samples_per_second": 0.964, "eval_steps_per_second": 0.482, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.625, "eval_logps/rejected": -224.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.34375, "eval_logits/chosen": -1.6328125, "eval_nll_loss": 0.53515625, "epoch": 4.16161616, "global_step/max_steps": "100/120", "percentage": "83.33%", "elapsed_time": "1h 4m 59s", "remaining_time": "12m 59s"} 
+{"loss": 0.4855957, "grad_norm": 0.0565223, "learning_rate": 4.21e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025612, "rewards/chosen": 16.0, "rewards/rejected": 5.28125, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -436.0, "logps/chosen": -284.0, "logits/rejected": -1.5859375, "logits/chosen": -1.6328125, "nll_loss": 0.453125, "epoch": 4.36363636, "global_step/max_steps": "105/120", "percentage": "87.50%", "elapsed_time": "1h 8m 10s", "remaining_time": "9m 44s"} +{"loss": 0.53631592, "grad_norm": 0.05482776, "learning_rate": 1.89e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025642, "rewards/chosen": 17.0, "rewards/rejected": 5.90625, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -592.0, "logps/chosen": -422.0, "logits/rejected": -1.6171875, "logits/chosen": -1.6875, "nll_loss": 0.55859375, "epoch": 4.56565657, "global_step/max_steps": "110/120", "percentage": "91.67%", "elapsed_time": "1h 11m 21s", "remaining_time": "6m 29s"} +{"loss": 0.53705292, "grad_norm": 0.09145729, "learning_rate": 4.7e-07, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025681, "rewards/chosen": 17.25, "rewards/rejected": 5.875, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -428.0, "logps/chosen": -378.0, "logits/rejected": -1.546875, "logits/chosen": -1.6796875, "nll_loss": 0.5546875, "epoch": 4.76767677, "global_step/max_steps": "115/120", "percentage": "95.83%", "elapsed_time": "1h 14m 29s", "remaining_time": "3m 14s"} +{"loss": 0.45579071, "grad_norm": 0.05182072, "learning_rate": 0.0, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025675, "rewards/chosen": 17.25, "rewards/rejected": 5.96875, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -528.0, "logps/chosen": -390.0, "logits/rejected": -1.484375, "logits/chosen": -1.5703125, "nll_loss": 0.48242188, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "1h 17m 45s", 
"remaining_time": "0s"} +{"eval_loss": 0.44873047, "eval_runtime": 4.0424, "eval_samples_per_second": 0.99, "eval_steps_per_second": 0.495, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.75, "eval_logps/rejected": -225.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.3515625, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.53515625, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "1h 17m 49s", "remaining_time": "0s"} +{"train_runtime": 4670.8786, "train_samples_per_second": 0.424, "train_steps_per_second": 0.026, "total_flos": 64331810078720.0, "train_loss": 0.70841242, "epoch": 4.96969697, "global_step/max_steps": "120/120", "percentage": "100.00%", "elapsed_time": "1h 17m 50s", "remaining_time": "0s"} +{"train_dataset": "1175.542929±552.835821, min=300.000000, max=6095.000000, size=396", "val_dataset": "1179.000000±512.550973, min=698.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 14804.4401M Params (34.4064M Trainable [0.2324%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-120", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/checkpoint-100", "best_metric": 0.44824219, "global_step": 120, "log_history": [{"loss": 1.849609375, "grad_norm": 0.9619308660072415, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.014445, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -916.0, "logps/chosen": -466.0, "logits/rejected": -1.7265625, "logits/chosen": -1.9296875, "nll_loss": 1.4765625, "epoch": 0.04040404040404041, "step": 1}, {"loss": 2.4267578125, "grad_norm": 0.9539301708536683, 
"learning_rate": 8.333333333333334e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.022275, "rewards/chosen": 0.0281982421875, "rewards/rejected": 0.03125, "rewards/accuracies": 0.15625, "rewards/margins": -0.00311279296875, "logps/rejected": -552.0, "logps/chosen": -576.0, "logits/rejected": -1.609375, "logits/chosen": -1.625, "nll_loss": 1.453125, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.6552734375, "grad_norm": 0.7674288621413258, "learning_rate": 9.969653386589748e-05, "memory(GiB)": 49.37, "train_speed(iter/s)": 0.024195, "rewards/chosen": 1.4296875, "rewards/rejected": 0.43359375, "rewards/accuracies": 0.699999988079071, "rewards/margins": 0.9921875, "logps/rejected": -488.0, "logps/chosen": -510.0, "logits/rejected": -1.46875, "logits/chosen": -1.7421875, "nll_loss": 0.98046875, "epoch": 0.40404040404040403, "step": 10}, {"loss": 1.51802978515625, "grad_norm": 0.49307127908285064, "learning_rate": 9.847001329696653e-05, "memory(GiB)": 50.82, "train_speed(iter/s)": 0.024958, "rewards/chosen": 6.0625, "rewards/rejected": 3.53125, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 2.546875, "logps/rejected": -488.0, "logps/chosen": -462.0, "logits/rejected": -1.6640625, "logits/chosen": -1.828125, "nll_loss": 1.1875, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.8334716796875, "grad_norm": 0.12751888679721451, "learning_rate": 9.632470336074009e-05, "memory(GiB)": 50.82, "train_speed(iter/s)": 0.025382, "rewards/chosen": 10.0625, "rewards/rejected": 5.875, "rewards/accuracies": 0.9750000238418579, "rewards/margins": 4.21875, "logps/rejected": -450.0, "logps/chosen": -294.0, "logits/rejected": -1.8515625, "logits/chosen": -1.7109375, "nll_loss": 0.7734375, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.55615234375, "eval_runtime": 4.2332, "eval_samples_per_second": 0.945, "eval_steps_per_second": 0.472, "eval_rewards/chosen": 14.25, "eval_rewards/rejected": 6.75, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 
7.4375, "eval_logps/rejected": -218.0, "eval_logps/chosen": -812.0, "eval_logits/rejected": -1.515625, "eval_logits/chosen": -1.625, "eval_nll_loss": 0.6875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.672528076171875, "grad_norm": 0.09053125249136239, "learning_rate": 9.330127018922194e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025025, "rewards/chosen": 12.875, "rewards/rejected": 7.1875, "rewards/accuracies": 1.0, "rewards/margins": 5.6875, "logps/rejected": -438.0, "logps/chosen": -370.0, "logits/rejected": -1.6171875, "logits/chosen": -1.7890625, "nll_loss": 0.609375, "epoch": 1.0404040404040404, "step": 25}, {"loss": 0.57813720703125, "grad_norm": 0.06259364123158316, "learning_rate": 8.945702546981969e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025309, "rewards/chosen": 14.0, "rewards/rejected": 5.59375, "rewards/accuracies": 1.0, "rewards/margins": 8.375, "logps/rejected": -454.0, "logps/chosen": -370.0, "logits/rejected": -1.703125, "logits/chosen": -1.796875, "nll_loss": 0.62109375, "epoch": 1.2424242424242424, "step": 30}, {"loss": 0.57001953125, "grad_norm": 0.06341948959916613, "learning_rate": 8.486484005469977e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025272, "rewards/chosen": 14.5625, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -384.0, "logps/chosen": -398.0, "logits/rejected": -1.59375, "logits/chosen": -1.8359375, "nll_loss": 0.58203125, "epoch": 1.4444444444444444, "step": 35}, {"loss": 0.542498779296875, "grad_norm": 0.05322574754844299, "learning_rate": 7.961176263324901e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025462, "rewards/chosen": 14.5, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -474.0, "logps/chosen": -372.0, "logits/rejected": -1.7578125, "logits/chosen": -1.7109375, "nll_loss": 0.57421875, "epoch": 1.6464646464646466, "step": 40}, {"eval_loss": 0.46728515625, "eval_runtime": 
4.518, "eval_samples_per_second": 0.885, "eval_steps_per_second": 0.443, "eval_rewards/chosen": 16.125, "eval_rewards/rejected": 5.5625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.5625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -792.0, "eval_logits/rejected": -1.5, "eval_logits/chosen": -1.6875, "eval_nll_loss": 0.55859375, "epoch": 1.6464646464646466, "step": 40}, {"loss": 0.5608245849609375, "grad_norm": 0.06288771891169008, "learning_rate": 7.379736965185368e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025554, "rewards/chosen": 14.6875, "rewards/rejected": 5.09375, "rewards/accuracies": 1.0, "rewards/margins": 9.625, "logps/rejected": -540.0, "logps/chosen": -296.0, "logits/rejected": -1.78125, "logits/chosen": -1.609375, "nll_loss": 0.58984375, "epoch": 1.8484848484848486, "step": 45}, {"loss": 0.57568359375, "grad_norm": 0.05520286247644814, "learning_rate": 6.753187775963773e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025458, "rewards/chosen": 14.875, "rewards/rejected": 5.53125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -418.0, "logps/chosen": -302.0, "logits/rejected": -1.640625, "logits/chosen": -1.6953125, "nll_loss": 0.50390625, "epoch": 2.080808080808081, "step": 50}, {"loss": 0.525238037109375, "grad_norm": 0.03187452398084299, "learning_rate": 6.09340545603188e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.02552, "rewards/chosen": 15.375, "rewards/rejected": 5.59375, "rewards/accuracies": 1.0, "rewards/margins": 9.75, "logps/rejected": -382.0, "logps/chosen": -342.0, "logits/rejected": -1.6171875, "logits/chosen": -1.8515625, "nll_loss": 0.5390625, "epoch": 2.282828282828283, "step": 55}, {"loss": 0.529901123046875, "grad_norm": 0.04272549422103994, "learning_rate": 5.4128967273616625e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025575, "rewards/chosen": 16.25, "rewards/rejected": 5.78125, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -374.0, 
"logps/chosen": -358.0, "logits/rejected": -1.6171875, "logits/chosen": -1.8046875, "nll_loss": 0.51171875, "epoch": 2.484848484848485, "step": 60}, {"eval_loss": 0.4541015625, "eval_runtime": 4.5254, "eval_samples_per_second": 0.884, "eval_steps_per_second": 0.442, "eval_rewards/chosen": 17.25, "eval_rewards/rejected": 6.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.0, "eval_logps/rejected": -222.0, "eval_logps/chosen": -780.0, "eval_logits/rejected": -1.4140625, "eval_logits/chosen": -1.65625, "eval_nll_loss": 0.5390625, "epoch": 2.484848484848485, "step": 60}, {"loss": 0.537469482421875, "grad_norm": 0.0790840513667992, "learning_rate": 4.7245611982206724e-05, "memory(GiB)": 52.27, "train_speed(iter/s)": 0.025599, "rewards/chosen": 15.625, "rewards/rejected": 5.6875, "rewards/accuracies": 1.0, "rewards/margins": 9.9375, "logps/rejected": -496.0, "logps/chosen": -358.0, "logits/rejected": -1.6953125, "logits/chosen": -1.6796875, "nll_loss": 0.515625, "epoch": 2.686868686868687, "step": 65}, {"loss": 0.49605712890625, "grad_norm": 0.045626680740731757, "learning_rate": 4.0414468403813095e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025673, "rewards/chosen": 14.6875, "rewards/rejected": 5.25, "rewards/accuracies": 1.0, "rewards/margins": 9.4375, "logps/rejected": -532.0, "logps/chosen": -253.0, "logits/rejected": -1.703125, "logits/chosen": -1.546875, "nll_loss": 0.486328125, "epoch": 2.888888888888889, "step": 70}, {"loss": 0.562591552734375, "grad_norm": 0.05430409750149703, "learning_rate": 3.3765026539765834e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025585, "rewards/chosen": 16.625, "rewards/rejected": 6.4375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -408.0, "logps/chosen": -368.0, "logits/rejected": -1.484375, "logits/chosen": -1.78125, "nll_loss": 0.470703125, "epoch": 3.121212121212121, "step": 75}, {"loss": 0.4714996337890625, "grad_norm": 0.04354967811022926, "learning_rate": 
2.7423332084455544e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025651, "rewards/chosen": 16.5, "rewards/rejected": 5.9375, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -352.0, "logps/chosen": -346.0, "logits/rejected": -1.5546875, "logits/chosen": -1.7265625, "nll_loss": 0.490234375, "epoch": 3.323232323232323, "step": 80}, {"eval_loss": 0.4521484375, "eval_runtime": 4.3906, "eval_samples_per_second": 0.911, "eval_steps_per_second": 0.456, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.5, "eval_logps/rejected": -223.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.359375, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.5390625, "epoch": 3.323232323232323, "step": 80}, {"loss": 0.50703125, "grad_norm": 0.047973918415174815, "learning_rate": 2.150959712448669e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025587, "rewards/chosen": 17.125, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 11.6875, "logps/rejected": -476.0, "logps/chosen": -400.0, "logits/rejected": -1.6328125, "logits/chosen": -1.6953125, "nll_loss": 0.54296875, "epoch": 3.525252525252525, "step": 85}, {"loss": 0.4710205078125, "grad_norm": 0.043401090316595184, "learning_rate": 1.6135921418712956e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025624, "rewards/chosen": 16.125, "rewards/rejected": 5.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -468.0, "logps/chosen": -302.0, "logits/rejected": -1.5625, "logits/chosen": -1.609375, "nll_loss": 0.46484375, "epoch": 3.7272727272727275, "step": 90}, {"loss": 0.5209747314453125, "grad_norm": 0.04969693397822946, "learning_rate": 1.1404167454183957e-05, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025657, "rewards/chosen": 15.5625, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 10.125, "logps/rejected": -490.0, "logps/chosen": -258.0, 
"logits/rejected": -1.6953125, "logits/chosen": -1.625, "nll_loss": 0.486328125, "epoch": 3.929292929292929, "step": 95}, {"loss": 0.547564697265625, "grad_norm": 0.04492084211332035, "learning_rate": 7.404029558083653e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025612, "rewards/chosen": 14.9375, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -486.0, "logps/chosen": -250.0, "logits/rejected": -1.765625, "logits/chosen": -1.625, "nll_loss": 0.44140625, "epoch": 4.161616161616162, "step": 100}, {"eval_loss": 0.4482421875, "eval_runtime": 4.1505, "eval_samples_per_second": 0.964, "eval_steps_per_second": 0.482, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.625, "eval_logps/rejected": -224.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.34375, "eval_logits/chosen": -1.6328125, "eval_nll_loss": 0.53515625, "epoch": 4.161616161616162, "step": 100}, {"loss": 0.485595703125, "grad_norm": 0.05652229958262343, "learning_rate": 4.2113336672471245e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025612, "rewards/chosen": 16.0, "rewards/rejected": 5.28125, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -436.0, "logps/chosen": -284.0, "logits/rejected": -1.5859375, "logits/chosen": -1.6328125, "nll_loss": 0.453125, "epoch": 4.363636363636363, "step": 105}, {"loss": 0.53631591796875, "grad_norm": 0.05482775842378465, "learning_rate": 1.8865999845374793e-06, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025642, "rewards/chosen": 17.0, "rewards/rejected": 5.90625, "rewards/accuracies": 1.0, "rewards/margins": 11.0625, "logps/rejected": -592.0, "logps/chosen": -422.0, "logits/rejected": -1.6171875, "logits/chosen": -1.6875, "nll_loss": 0.55859375, "epoch": 4.565656565656566, "step": 110}, {"loss": 0.5370529174804688, "grad_norm": 0.0914572949853764, "learning_rate": 4.738957681248379e-07, "memory(GiB)": 
53.72, "train_speed(iter/s)": 0.025681, "rewards/chosen": 17.25, "rewards/rejected": 5.875, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -428.0, "logps/chosen": -378.0, "logits/rejected": -1.546875, "logits/chosen": -1.6796875, "nll_loss": 0.5546875, "epoch": 4.767676767676767, "step": 115}, {"loss": 0.45579071044921876, "grad_norm": 0.05182072182744844, "learning_rate": 0.0, "memory(GiB)": 53.72, "train_speed(iter/s)": 0.025675, "rewards/chosen": 17.25, "rewards/rejected": 5.96875, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -528.0, "logps/chosen": -390.0, "logits/rejected": -1.484375, "logits/chosen": -1.5703125, "nll_loss": 0.482421875, "epoch": 4.96969696969697, "step": 120}, {"eval_loss": 0.44873046875, "eval_runtime": 4.0424, "eval_samples_per_second": 0.99, "eval_steps_per_second": 0.495, "eval_rewards/chosen": 17.75, "eval_rewards/rejected": 6.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.75, "eval_logps/rejected": -225.0, "eval_logps/chosen": -776.0, "eval_logits/rejected": -1.3515625, "eval_logits/chosen": -1.640625, "eval_nll_loss": 0.53515625, "epoch": 4.96969696969697, "step": 120}, {"train_runtime": 4670.8786, "train_samples_per_second": 0.424, "train_steps_per_second": 0.026, "total_flos": 64331810078720.0, "train_loss": 0.7084124247233073, "epoch": 4.96969696969697, "step": 120}], "memory": 53.72265625} diff --git a/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs/events.out.tfevents.1737757592.kml-dtmachine-18088-prod.64443.0 b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs/events.out.tfevents.1737757592.kml-dtmachine-18088-prod.64443.0 new file mode 100644 index 0000000000000000000000000000000000000000..870cd221ffa24ab2a6fd060c9cd67b2fd7609a50 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_8192_rank8_epoch5_what/v1-20250124-222534/runs/events.out.tfevents.1737757592.kml-dtmachine-18088-prod.64443.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fe7a8270680b705a8ca32bb960001eee189476d293db0297a380b7e90553698 +size 33741 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/args.json new file mode 100644 index 0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + 
"bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + 
"restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": 
"adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": 
"none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + 
"kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, 
tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, 
dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_config.json 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc896707cd0edf2184941df0476b3f1d9872d7ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952e2c5134936590d6e2548d2a7921b8f4be177f3543d77b8ab9b5d5658bd7b3 +size 134337704 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": 
false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, 
save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 
'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, 
sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f58d49dc7d958b73cb0025280172e429fab4144 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1332ea966743e94fcfb9bb5e984779545d6af01ea252c61b58a23a2ccdc18a +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe9598a601a26129ba897a2f4fc3f3e3822007e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f7cf3796ec9922988d8e9797de2fbb8224951d0d5010013561e41f735a5fd6 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab8b07e7be7211dacd722d3941d5daf6307e18e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ca62c7dba7171a2d6d94e5367cb143b2374dd1474e54148981167ff83791a6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8c23b9470ba5801f4646e3cf78b169d3e19e4ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5490468c68f8ab781832bd2f0581f58fbe54ac7774920c73a6bfa30e0b34b8aa +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d3963751f2be8852f9bcd46a35e2e82f737abb07 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3a719964c9c9cf47a55d44fa8a1a0587abd18372c371c74e295940fe68ac91 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d6788f630e2fd0eb5d3716e19033a2704630b45 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88697fc376989deaf5544adca20936c0c96e7a2b0ff0523975442bcad69ec61a +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c6f2fe9b7481b6c6eaa6fadca03e7edeb6d6f39 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2dabb5713e563266b48721ec822f5f44684aa544ccb8c260f1ce4d472e4ae525 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0eee8653afa8e2a1243dcbdf43e379bcddb0261 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4af52ede611c70f6376245621b0891483d83ce83122855dc8c277f8896459d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..590f5f9a24593f3ff9d06d041aef221737dd0206 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4cc80537f02f37cf1f81e75069a7940716041598f661e1f5c38d639ab70d501 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93bcd6ebfe133201ff2d0edfd5bf5d9072e1a70c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189b2628efe9b8e8bcd258696b515ec56e79e4a091ce991941e4c2de3f2729ce +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8ad9eb370d777d1a427eb8178fded39c3a56db7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6434d462b37661cd4e32d2c0d4b7673ee44c13c7f5f6a59eba992c9c1c69e274 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a162a06b3fc99250f9d3d102d5af2aafc330c67b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38daf6c27b9716373ec351071e778fdb64eefd3d1e95bb6c6a8162d0f07484a7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a0c2ede703f28e7ac4ca711a4c20959f9c6cbc2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b557dff718a0bd5e198584ec037a4e4aa4cd1a729afc691165e5a820d81093 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23eb9fa080e770a931bfe594532ad035995ebd67 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f57054d3c1a3ffb42857ac3baa055e084f48ecaeff4eac1c245b8c1967f4b302 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32c2064c2161c8ce8e4000304d05b0138ad329d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3679b565ace89aea01f4c60ec78b6ef80ce0f0aba4e046d989bada75931c26f3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62635ed5827e1e6b1ece213e1a7f847343abb2ab --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef333792be0fe6aa800f9126d09f64e73aa2775dd74b41e7e9475c8b09178cdb +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e44eb3fa45a557460f92c098f2fd1eccee781e9 
--- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e2e2737d63e3807ccb5977a1e9e37b3ec45ffad7bcc727e9e137aebc01368 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a05b36054db4d09dcf6d6c1dd36bb88d731a739 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.42285156, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 
0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": -900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + 
"eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + "rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": 
-8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + 
"nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + 
"step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 
7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 289169055416320.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + + def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + + def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e7176a1d2fd0663b7d71df3a858d2f0e06b459d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dd09d572018e96e6f29e2985e54c343bce7c777a164cb4f5b86fb1ae578baa7 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f35659d6b15afc526987a194ce53ff604b2ee3fa --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78c751b97ede51c019cad3572ab6928f412ad5ed5dc5acd423f8ae1c72eef7ff +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..304aa0b9184e40433628870d233c8e28f8613be7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee7f523e1b99101df2f5d86ee67b1615d96cf72345ecea556a08b966cf3319a9 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ba6f8e57d44a2142d5d181f17b8024d27bfa64c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0920057da8b095fab4951d4e8b538841af7176968bbd3529f44b3f063bf2c2a +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fb9ee1d74229cf0b75016cf78de67f3e7fffe14 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692b920697d5baaf5465b46859d0a13716d81a651976b0244d46785311718697 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..502a03b4e2481f9a536761d7fda9a09e86d567ea --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acdee7263dd0a73ff3aa7a90e6b9b3aa10e716eb1a8c40df4b93e0021c800071 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d8660534f18d2780f1498f7d16dd2cccb1197472 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809a164f02596fea43c07fba468729e60ee26a325b94c0193cf8c4564acb1462 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a51c679528fbc5126b86eca2d7e8cfbdd4e858d0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34cda0aebb2b75f16e34220b9767bad4f56a79536c83c670f8e384db3505b4f3 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7b9ddc8d5750f8955c26c1f11de8ae4f1b7b894 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6661a3e3ba75eae3a1d448d09a76e80eacece555552b19a4c90e00f63c708c4d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e16cb0491da7505d2ba2332fecedff9a7579e984 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:100f0886b43b2152911a1ac3327c97b56722b06bc92ada601a8fe57d6cc40854 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..024d5c15c918e6eaa2d0b8af01b3761ea7c20435 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1391a4065acedb60cf55a6bd91044fc7f07cbdb641e2242c2f6d6fc20fdeace +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..400d1ce12574e9ada03b2833fe4174b63ed0b4da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b356be7ba26489c4adeda3fb9a910a6edebf269595e5ab46a6e69e0f71d8241 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1683f7d75254e9fc8704c0bb8761168ce5431fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3f029357e3f766991cc0ce4b12129cad0a8fb127da99d9a1332c3eddbbc36c +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc24d29149a8abe52631246348a03906dbadcafa --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a8f78eca0f8d1c1252bed61fc196beedbba7250df4c24f81ee5a817556182ea +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b1e2fc30ee36f621407b54c3d8075f40ac298f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c286ca7c798e541441379635a95fd4a2aea4618eead75c5e34262a2f78c296f3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..300c87ddeb4265c795e925e5a888366dfa9e03e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace06d445c86c3aa8f283e8bb6a25163cba726f391e53a4739273d1d9c93ecf8 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c3ee0ef3dae8e8f83d0dc34dd365dea44f389ca --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217b4bbeb0f6c27c35be0a992bc5ec3da12663daacdb8393fcb8ae5d8fc63d9e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..0e13e0563ec45a863d519305a1251d3e72b9e3e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/latest @@ -0,0 +1 @@ +global_step120 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056328cf4dbfbdfaf5b7ffa668b29852f77a3798 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b76da7ccfd8d1a286433da6127628e0c6a1565950b2dea51fe5864ad3e6545 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66bf952960dc43442b85e495c7b76634d5b8eac1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 2.4, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 346481921097728.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c9690f3f988895adc04eac35dff7b35c9bf702ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99d1486d67136440cb5c115b54bf080f20e41c0c989f48d8bc6b86702e30565f +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..159e763c71a37cdd46f724e3a50d9a22a69a7477 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17ed67f44dc2fbfd134b78055f57a97d49a97870039c5573d47f6c8f5a7d0a0 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..164f5134f2b702988c507376503816050fce4d53 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53fb02356a9ca0647fffc388f8f7e3d10e37e59f91a205c60eac7a6efcd42c21 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cddbf144e2374aabb465f7f62f5386fd2a1b0be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e550640a278aba3b6dcac1f598aafa2de3de441506979a67a9bc1a1825256230 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d58fe61eb358753b960af1623dd721e25c278a3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a52544aff5c27f42972a0ee0b95031aee1ec7bd227f54f964caca8a00a678893 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0de2e9f74b330cbf606d82c05ebd2f00ef7af39e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba79ed6a002d27c60b8722a34f092e0b71600364e229cd5646c103368969f96 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3d5b0726a81f9b58ef4bc7d25ddc558dcc53a46f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538e0ebf1242101561ed7065d2ee8b2d38cd3d2f45ab9523532e6cbbad81a284 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60bf77e02ddd8d0fae043f2b8a0ccd47560ed498 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa48d6d3133a66a5419415332125a4b7595356d6e2a7a7cb9b7e6dd089c7c31 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6932c96722bd74a232ddc5e13098f73c4600c6b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fa0834b421a0169069feb324ffae60ca63d1344211a4f11bd0bacea21d920ce4 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50d669838e05c4a85ccb73e854384bdcc8f62f2a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba85c8e6fa53109c2a5c97fa2e1b9d8c7c3d84b77567f9500675a855668e185e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dd579d0732140cbcb929f7f0a7cfe4c10c2818f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56b37a0d8d9494259013fa81f51b508e86b9dc84196d71df3738b96f08222d31 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf5dd610a401a3bc379d0662134b0b8f931916a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b284432aa1d88f0eaf60862a87281947df4723e200485f2dfa01836a7d8d55a3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f441a8dc7614691a07f046bd540fb7c3b412b864 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8be766ca056ab279d159dffe15fc4f3131211e59140623ac3a1abb4dbfba5e8 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..434165eb7eda7e71604d098da406bb2041accba8 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dd08ddf0c5df9303b4274c8c2adbfe4c86fcaafeffb16f1af176c7a060d28a +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d6503afb248a341bc30bd2f63309e586b959731 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3684664a3e78e1537481f1067496eac1fba2e89b1ce73bad08e243622de889d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f515c6f0917b5aaed9b22759f281b1deb1d73d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d47f8fb62bb31fab4c711bad01567ee412ceea40b6a38f34bf9729d92ce2cc2 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b06c5bb5db0506886581ceee08b5fb7d3c81c180 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a541685035a050fef28ae37acdf050af58cc745e3ba83f2b74af9c33a60b9f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd2b9aef86529798137c2868d556e873a23c785c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/latest @@ -0,0 +1 @@ +global_step140 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6e613ad02e1482b1eef52ff51329fe67d4fceb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9c57c64e42f5d7ec5b6fd8bf14122cd4f49a4ae907dcde9c057b79cc82e639 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56d917dea913d796f3e36e034fe6ba3884b96437 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/trainer_state.json @@ -0,0 +1,674 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 2.8, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 402073554518016.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-140/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3d983390dd9b4d0c4e8f059f871e5ed0901f3d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09935eb0a4406f2535e45a636d7ce7b4fbdaebb8bb7de728a79cdfd5360d4771 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59256e546ed0bb4cebfa619d247036a543ed5995 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8551382b9184eb351277e3458a9b0b607906d10db7dbde165ddb5fae9b0fe6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff306617f8e1cf655d5a65ba64c8648b9de194a5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e922a7b041617f2184a9581eb48e90d3a06bd93f72de565057c542add947b17 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f0737bcb7f371516cc6dce797a264e81591bcac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8d049e79b02d87ebb884e0cb67c5d3840bda090276a06ffb0c2b77f57170fc +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d05251a7fb05506bd68fe4eaea2dcb027bd6cc52 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f69ce1cba30e6142df90ae55265a63325c986aad408856c2ead84c61eb50909 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e8ab4342b36e337394e438f2eb381dc2a7fdd4b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbd42a6c9d969567e84cbe82fd79abd2421d8e8cfd50f29c6ad5cdf2851e9689 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0bb7d68050e49110be29eb375d5469b63ff4ce6b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d776aee96dba9518545b0a1af55b011f0601ea745d89c583bc200485b510535 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f38f48eb895d98ec4f0cb3dc1e908e9d354cf005 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cabee43eb98bee4c276f8b5dff40ad516de0f30e70ae5963a511d0fde491f25 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..089fab14fb97ff8e36987b30ea9141cb8cbad638 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6b18aa2d70bd8d1032d77a5ded7a073bbb5d00f162fcf69c44a3dc695b4d3611 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92ef6f5d456213e36f8711ede0200ab8000b65ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ece968334e9a4e84cf1c26054607a358898bb7a9aa1e4987673b4c8eadf4fa +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0e105fd2b1b5ac4df22bab0a461339ef1dc0fb8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b736a1290f6de23470d030b1a237677ff66adcd19955adb340161fdeb71f3f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fb22f6a40099c6ad1d123a968c4c0032455f8c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e4e43fefe6ae28ed460816172fa6e2ee6dd3413fb4cab71be3adcb9c3b6658 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06bbeb209cc670e60aba299a61c5ea75582b6f92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891c7c230f608025b4f6fa942f999287ceb2ef9ea064d66ef00ecb6c11b45138 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b406230a10be851a1eeb3ca7218084b04cf821c0 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ed7fbb184af80211122be1643561d40a79ae278157fd8aeeb206018df778bf +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6285c2e170f5b0a5f88175afc8992ea9b056765 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895450fde1fe02b278d751361d1cdc00905179c2734094d931d6fc6be91bdd64 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e994069118d0fefaef50121e1d186aea70688f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476165855ad1567b52b353abe67105987165f50259bd9930d73d6a105b48a783 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3b21d106c1efcafb3c88f3d50510d9ce615d09c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88f628060589f0ab7cd5f96e0aeca6bef8f7d2a7616134e9d990d552dccf763 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/latest new file mode 100644 index 0000000000000000000000000000000000000000..3df30ded267d950ff3ca04cffb9660be12079ca6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/latest @@ -0,0 +1 @@ +global_step160 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e31a2394e12bf431ae13288c3d90fe4727f07fa7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6462d333dbc5bb5e497ea9b0adb960f7616f79e6eea63222de6d5bd559516 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1db0a0f44aa3ac1d82c3bf8dc2d8968eeba4ce7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b045e1bfa728f51c8b51ab0faa20b128a4fbd350da006b9b39a19e24abdf5a74 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..75de18f57a056bd6a5f89df1abd045678f3f919e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a3d058d2628a61848c2441d313f251278bd8f74ce43dc44d8cd8ad3e619a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fd100693bc9f3267d044ce4a16e702502dc03ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b7f72fc498e6eaa671cdc0e8a627a668b8ef607063a22ddb4edbc05e791be830 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aeeabfe119f1cb0c8c804f1b9a4d3049f478d69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12889af98e175b734a788f4c5b8c4da91dd61ff3a05aaf61b9d4c66aa3dd8ad6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fe0f42382ab06f4d26d753745a914c9e46100e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe21a86abfceeac2cf2f48afd61a9a506cf61a287f3403f1adf391bb2ffa5a83 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5830ca6bd04645962b6e56a00a91cd8349ca449c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:73488bec91f9dee6d8105d06f99edaf4d27b6b064250d4c7023f33285b2f3132 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..343d1c0475f0dc64100dc67b09195e047f1a7bcf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf6ee1cc2e1325b428a21172ec4e61b7220c5489751ea11c06bb66c77a0cd08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..229789af83e72e748f236450e9d2df977318d98a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b659f5e1f39ab526587d47a9d305eeca96cdb1335d25ff0a7b9958f685604b4 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2ce81319ad07591a40bb92292b6411c4ff77f1ed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/trainer_state.json @@ -0,0 +1,763 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 3.2, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 464936217346048.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-160/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1e595a0dbfa039505de3c9c15543ad65291c09e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee7d17a2ff26946f96e5c0a0df14d39f43150f67335241257cd9de7a239adbd +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff162cc1a962dd2b95f18bbb6eaa4f05c9524c72 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3000470f7f879ab8c60a784e8ece260ace579306fa3d998571ed5367b7292d22 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca6ef7f9404ad55eab57610ffd678a5d9a530f00 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf8653d48991f9972abbc34ef0f31bf37fcae88e06391b922df96506d16483b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a72470e94fd408b811ec97d1cb439f4732d87ef6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d94e638f6a783480282766119cd8af0a70b4a5afb33fa3570fe310a3cdf471f +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5beb29a5d5fbd900c550451b3f0c3e29d0a02682 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af0e4c4ddf3cb03b2dca6c7c011ce317f1e6fd891c7f12c1d84b3ffc21078ad6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..abf643d29f415bc9d0a28fa7499412d89f962401 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc65fd29fdac3dcb042ae4be425839bdf86e4cadd40cd70e4707a2ba72f03280 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d56b243dc59b0f4f22c07dfd07d6f615868402b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8af445c198914eb10abd2ccd6c55233fb73d926f2f00a7be4a51f912a98bd2e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4469a93908caa09cd17088110e2869bde96973b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad08a7b16f3e46420e9f817b6524fde31ec96315d5b51453c7517a241d78e0aa +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3d030e03e054bb88986fe2e60977ffe373c51bc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:79b7d7fb9c4e0009a229d3041c0b8c3b22865aad0781d597ebdc00ec84885e6b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce7402793537355fc2871447bc69ed9d94874de1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88da61dd1d2eadde0def04e82fb9c563190b82b4fada435767b043e9cb99e9c2 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6861c5213a77f98d7ae04bf7fdf5544f18e38ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d896cf44f5ae5cfb0b0672aaa5d2f2bc7dc398e600579d7a563c1e12e5e8f868 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dbf57ca5528fae0d7dd4f887a42386490c74d3a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99a62d11ca772c7128224795581b8e3dede5719839171afa4be80583c1b9963 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d381b635aadfa9d274f70f301ab5999d14d7361d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7805212658b287efc8164e7a8ff9ca1f75379dae7f732429753e67bdc995ba +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd0250e69ecb6ad2b478b783a98ae56a51eefb31 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58f8c8180c503b0557b6daa834a7c7acb3fb718c2e9073e433664bf7d023d50f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61c3eb94cb68cdb8e5713e23b28481bac46e91ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6c23ca0fb63e716d16fbe0b04909afe92b09d6293237ee53a7d4973a388fd20 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7730a09d64d14b9556a34e9555e1a2eb3445d9d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f56560f4e984835705279e3f77f7ff5b3859b4dc81de22eb106b902641eef4f +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eececf14bb3a4e77c4c7cdcd716ea2c68a56c1e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b1e2b044fbcaa3c03b8be6d0c38ea38881719091425c350d96e9db2b9e7a5d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/latest new file mode 100644 index 0000000000000000000000000000000000000000..eac7d625396c2750025575c77b8da5d622b0c7dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/latest @@ -0,0 +1 @@ +global_step180 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f51b498d48145bd9cc14b35f8236b9ec95a4f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08e59ac81067b262a084604cd3392250166c2841 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20a24c17b4be2ee59cd5e6682010519318a91e58 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..54050f6cf8fb847e2a926e14a7aad2647761521a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..263aae475c49b090bce43f143308192c5bf9a95b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..942ed5d60ae87dce686b33da76a34db404036dc6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..57789be3df3983cb8acc1500bf6470ffadb1c578 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32d6e2e7eb7148713b473b0c821a98e616ab6e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18942cfbbbc36710e196a20b862a745c9dcc2468 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa6cf7ac608af8ab72180ce60dcfa61b0bf4eeab8e185f70f65a95b45e6b7a +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3e34a98309b8841f383bd5037e1abba8283935c4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/trainer_state.json @@ -0,0 +1,852 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 3.6, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.0577057175486542, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.6953125, + "logps/chosen": -132.0, + "logps/rejected": -418.0, + "loss": 0.33818817138671875, + "memory(GiB)": 61.51, + "nll_loss": 0.138671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 14.4375, + "rewards/rejected": -2.078125, + "step": 165, + "train_speed(iter/s)": 0.113429 + }, + { + "epoch": 3.4, + "grad_norm": 
0.38519303074023925, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -498.0, + "logps/rejected": -159.0, + "loss": 0.3404090881347656, + "memory(GiB)": 61.51, + "nll_loss": 0.341796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 20.625, + "rewards/rejected": 1.3125, + "step": 170, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 3.5, + "grad_norm": 0.5718561646417449, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.6640625, + "logps/chosen": -302.0, + "logps/rejected": -436.0, + "loss": 0.3107784271240234, + "memory(GiB)": 61.51, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.8125, + "rewards/rejected": 1.3359375, + "step": 175, + "train_speed(iter/s)": 0.113326 + }, + { + "epoch": 3.6, + "grad_norm": 0.30064476351594455, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -418.0, + "logps/rejected": -247.0, + "loss": 0.39559040069580076, + "memory(GiB)": 61.51, + "nll_loss": 0.494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.375, + "rewards/margins": 18.0, + "rewards/rejected": 0.3203125, + "step": 180, + "train_speed(iter/s)": 0.113857 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.5, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44775390625, + "eval_nll_loss": 0.4140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.203125, + "eval_runtime": 3.7431, + "eval_samples_per_second": 1.069, + "eval_steps_per_second": 0.267, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": 
{ + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 523088368500736.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-180/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + 
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def 
parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5423ee083186bafa82eb6e60e35823c5b448dbe7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a863fca3a4674c9b17e93136ec32a3dbcb50280ed31432478a6b7618e089b252 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab45a301574736f73442f3c729da65a6cc896ef --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ace2e2fd9a075886ec2725d6ca032794ee6583295a3565c6dc2c0733a018e3 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..671e37fc23db0e35ed48dde7a92af2f7f9c9df6b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c9db0e741f50b3cea1635c4a7538e715f179c151c6bb5d425126df92b7658f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8687b3283e35580609a817d7d347399c5d508998 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa920120f142e12802732ab78d23b2de65cc5c728693312f690c98ecfdd7623f +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df37358f15b5bbb35437c8dd723c274bea288d10 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0b4fca769e4c5b5bd74babd8c7b759ff02929720c99e1e796ae14eb9e83d73a +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0435d24b75836e6d37030c0735ebf5b872ed748 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0731bdf96d18ebd2e3aab40d450e1c1ce1b2d78491e814f2643bb6968bd7d324 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4ef497fc1d308d3a5a1f8c2ecfe007d9e7de249f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7630e9e38e3859f73373486f23f8b7fc187a2c2c21d1a804bb3824ac206d41a +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..217306b3110a1ddfdff835fed1527a039879c6e7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb6126693af2525fc6ad8190e952b8b4beb7c107f60524e40f6a264e1f2f47d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa6d142e32750cb9bc28d177c363889df0260b9e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c8332da73d3115b369e20aeb09dfe3ecb123d1e2c576535f6794bc94c3f3b389 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71e6809710083fe9e4f43f4f24320ab837b4b2d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc32c30258d2e509d189a7fb90875c5854417eae2fc2c69faf195175c37bb12 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df4cf8107b9a471b5634be66dbaf87b0ddccc0fb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52500ee3ca3c11c5b445ab093cf8153c3fff876beee85b0abd10261d911602c7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c74002e4e7e77b8d2379cd0fc0051218a501eef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8782f491d6fb02bb2e55278718c1f43545f2b0854ea7d67dbdf8b2e329462c14 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..362fa577140554954bdf284c91ce9280351ab4b7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae71a5ab33f00bdcd484910f31edda017814c9cafade2221d327f59b4a096e70 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e48629d032c86a46eff6dc97fa0d157e6ea400c8 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d3824c22ef06bba7b85ad2be38b5269f76c72eee929e9ffca7d0475f0bdd3f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5914584e64f6b0b05de05207b1abfb5ae081c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c75fa988ef5168c28fc1e2d5f6aa773f62a6d4ee47cb09d5b6c0edb3e009325 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9e00fcbe7c3e8396f4b9c17f08b813c8edb64ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb900f2674600b021e706038cca71fb2f23fa4ef4bdcea1e79d46e83150d85be +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21854f7478406ebcd42308f8d76481466e204436 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46405c8ff8635574e3d845efc734f33cf5a7ac4e72e9ec8de87bb6995bf2970 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2729ff9a97436d6c8ad743637f529065140ad3f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e1cf73eea4791075e839e628da180bf39e1e01fcc1630f4ac9c723d8793968 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8b89c585f5b1ac9628189130903ce8174a34b7fd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.54589844, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20", + "epoch": 0.4, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": -900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + 
"nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 58975981928448.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44139d2177b20ce6063806ac5260df31fb6fb2be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131186b6842807806acd4a6227b068fe45c0a86ad4aec3b79fbf0ffc76e5f0ad +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d988dce1c2f6218b1dba00aa782a6703bcd5a1e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46abd630238c56eaf9327a70a63bc32d8b1676a53cc52d41da69631591f6465c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e640e969be1688ef5412c7168540d8c874c2b9d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc47c602441618dae4b6850caa09c126f985d8545b02b143c8082c1bd00094a8 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df3f963acd955d1ba458a7d57736a6607f56e00c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a20214c6087ea4549ae2e361fef2ed150ff7b1c6f0c06d265d61303495c45398 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ec0e54ce7cb140ddcadc2bc680fd62248163fc4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520a95a31e05ed47873823201b5ff1d377ab4cb13493374d776ed6abe57676bc +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec655f0e26f1821e4ce7d38d504bdbc66ac6787d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604a99e80dd621229f06b603e0d456eb69ff369a6fba672a0da0b4cb4ff5eda8 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6b58d7551a6cf938bd16f2a5ef22869ff4e58848 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe244d922edaa798b9d9f1e3a8017a9e88b223234965c3c000053b1d381e891d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f0bfb540558428c27aa2764aab5599bdc8179b4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761a9feb136f0557e1062da3464015dd5c117c376811d51d880dd827e94fa9ef +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1131b4885f89a0473df5fc0e6929638fc030e49a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e147f5aab367a9b4841ad561d035026c15e84246a473c8192f8ee346282efa89 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bde23f44b716ef5fcc6d6390ebdf0edc5b15b92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e33cde7b16859b5088f4ffd3845d8b48453bc84e09f7506ebffb9234610fe5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6bf47b04a0b89fed5ad1c50e85a4ff37b487d8f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd5cc584e21418e22cbbbd2d7ec48f6ec048c64002e0ee900b9af47af8dbe84 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93dbf664b416a8bd76825356c697a24be80adb0c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cb0b97c861f9c1b765a07fc612a643ae9b975529c3e6a58fbbff5e56c8029d5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0271d603f4f65ede5f2293541eb3ec66210a36c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b064ada0f9c945ba8307572a411ca15e51a66bd356854dc8d164870a90556196 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..349488e8fb77bbbff083f02acfb8f80e05cc3b1b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7237dee3b29ceb881f3bea76e22e054294c64423b2ce152496587f546b4e3642 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59ad21e07ae6b3d93dbade2be79e362268e455d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dafaf936472fd312519629ae86377e05b4e8366a3fc1ae46ab7357f088e5b8d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d1a3db634dbcb4cce6a7bff3aa030847e0d39f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b62b423d79abaec10d71574dd8918809f6362cff22fc46e6cccda16ddf4a4ea +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5acb5f25acad0b7402adf58b560cf3083e3fc002 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f88406a16b5da116748d1b2e4b370def2120e4e9c689a12a815d2f60c4eb6d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/latest new file mode 100644 index 0000000000000000000000000000000000000000..753e24e10f3a2489150f458205cf759fd8b6081f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..584f4a4a43f100f35696d7314a633631af587f25 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891ffa7c7dae99113aa986d67278b52b8c57db55001dc3547a61f24569a34ee +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..05b027a867e5e9cebd446293ecff82cfb240cc76 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b92875cb04deec367605433847d1bda444b178b643d2da7ed9aaf738d232b4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..af98f0dfe2a5d89fbccf90df58246a0b078c7016 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f5f3338a05e325b5408a1cd0b6f5e5b10fad05fe479d63f44bec4cf18107d6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..715aa4a4ee3915f810fc2bacb2153eb8a0913781 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1be749fea477a3867d44010631937e0d8f071ca5f9614f9795c92c7fa68833a6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bde70899833455b6ee4a99aff9388abc5ffe92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc4a5ea4532c621f4c8e9891117b2e597a7f005001e8b4f2a1b4da8c82bf964 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..90cdeaa2fe438098e9d95ddbc06c765e51af1e78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480f9fe7dd71b54d915b46162e34b780ba2467d5542115cc809dbca60b394c0e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd30529614c5be239cd9477af6bef0e313740b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c11d982dcd813e82c2d97a5491ce9624cff2dd22e8655ea617ccef1fc1474470 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bed311094effd49cc2c89237c675f56eade157d1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73494fac3a001cba7cedd097b97f028d4c1d136ee6709214b0a7fe305e5b9089 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b08896e3e64039017a0606b43a6327f1f78848dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826281cb7f404c3805b9798147d05074dd208eac748e2052087055a015aaeaed +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..411f005caf8abd057c1302b15766940461b3bf6f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/trainer_state.json @@ -0,0 +1,941 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 4.0, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.0577057175486542, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.6953125, + "logps/chosen": -132.0, + "logps/rejected": -418.0, + "loss": 0.33818817138671875, + "memory(GiB)": 61.51, + "nll_loss": 0.138671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 14.4375, + "rewards/rejected": -2.078125, + "step": 165, + "train_speed(iter/s)": 0.113429 + }, + { + "epoch": 3.4, + "grad_norm": 
0.38519303074023925, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -498.0, + "logps/rejected": -159.0, + "loss": 0.3404090881347656, + "memory(GiB)": 61.51, + "nll_loss": 0.341796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 20.625, + "rewards/rejected": 1.3125, + "step": 170, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 3.5, + "grad_norm": 0.5718561646417449, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.6640625, + "logps/chosen": -302.0, + "logps/rejected": -436.0, + "loss": 0.3107784271240234, + "memory(GiB)": 61.51, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.8125, + "rewards/rejected": 1.3359375, + "step": 175, + "train_speed(iter/s)": 0.113326 + }, + { + "epoch": 3.6, + "grad_norm": 0.30064476351594455, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -418.0, + "logps/rejected": -247.0, + "loss": 0.39559040069580076, + "memory(GiB)": 61.51, + "nll_loss": 0.494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.375, + "rewards/margins": 18.0, + "rewards/rejected": 0.3203125, + "step": 180, + "train_speed(iter/s)": 0.113857 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.5, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44775390625, + "eval_nll_loss": 0.4140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.203125, + "eval_runtime": 3.7431, + "eval_samples_per_second": 1.069, + "eval_steps_per_second": 0.267, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.25784447177312775, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.9140625, + 
"logps/chosen": -412.0, + "logps/rejected": -197.0, + "loss": 0.3451987266540527, + "memory(GiB)": 61.51, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.875, + "rewards/rejected": 0.62109375, + "step": 185, + "train_speed(iter/s)": 0.11379 + }, + { + "epoch": 3.8, + "grad_norm": 0.3269795021594465, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -1.984375, + "logits/rejected": -1.859375, + "logps/chosen": -239.0, + "logps/rejected": -560.0, + "loss": 0.3305183410644531, + "memory(GiB)": 61.51, + "nll_loss": 0.478515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 17.0, + "rewards/rejected": -0.5, + "step": 190, + "train_speed(iter/s)": 0.114169 + }, + { + "epoch": 3.9, + "grad_norm": 0.46923090007407836, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.7421875, + "logps/chosen": -346.0, + "logps/rejected": -524.0, + "loss": 0.28177928924560547, + "memory(GiB)": 61.51, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 19.625, + "rewards/rejected": -0.9609375, + "step": 195, + "train_speed(iter/s)": 0.113877 + }, + { + "epoch": 4.0, + "grad_norm": 0.5973510030321513, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5234375, + "logps/chosen": -270.0, + "logps/rejected": -524.0, + "loss": 0.32286620140075684, + "memory(GiB)": 73.34, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 16.5, + "rewards/rejected": -0.19921875, + "step": 200, + "train_speed(iter/s)": 0.113643 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.75, + "eval_logps/rejected": -231.0, + "eval_loss": 0.44970703125, + "eval_nll_loss": 0.423828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + 
"eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.6961, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 580410816069632.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d1f023f25c78f07cadc1a858ba524086a92ca16 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c79c4b14f4cc95b569e09e0609dc3f8b94a7ee989135fa5216347a7790d2d70 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c72cb658bccdbfbe3892944a3f91bcb8be64049c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0144ff848a0223dc22a7c2c1bb831f28b0c3a29b8917809c976ccb509d5d46ee +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5826cdc5ee0b8e03ee393680452e0aead507b710 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76771f010bb1bb66a42fd7823b271f83864c416ce4a6d076676841a69ab93763 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0c5197ee1c8cbf32dbd56b64d4f750bc8956f0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d7931ded7623867ccd2c5bf7addcc5ab846150aa06e12b59addfd16e3aad573 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16a111a5873ac9ebe3585c8ddfdc38fcb44b78c4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebbf0918ad2c79b5779e789912cfe34ca567a75a73e51652a63b3d4335d06d0f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..467275ff3c035ca0af0b33d63d2364d81250994f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6150abbdda809c1ea0ec43eed852ac9daf0d51ef399b18dca7c86b30daf250e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..7cba5a91089f15b8aefb4b677231c28c87604d4f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af9b5979404d8fc8a62178bab640b4d8808dee2d8381f174f5bdc0d4482c340 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bf2c5f6812afe05c809623fdb54ac43673aa2c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:652db9fb52c5313de9db4d63aff2ec5e02cdf9c4feb84b39646eec34e7334729 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5fb1b73e7c51580c321e1104bfd23af08975a30 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6c31bbf332eade334b49fecc3ede1e35ea95b5fcfbd00c724681c23f31396f0f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5008545763215fdc50fc0d1d379597b5f24a8a2b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ad77c70c15202358a7ff7d6b1c13984b94df7e3ec953a112ddafbccd9bfd379 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..282e3dbb0373140b1da8c69eb2924ea0c53d2a43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f6f57fe76c52e8a35f234dca157c04b0e6d8729e2e355fc4df4e99ead34219 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92b737c9ba1961d1ac3fc17aadde348b5c8cc150 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59f16d65205d88957fca642dab82620d0cd32d7dfe4f8a2fc492cd387b27ce5e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c846ee202b383799ddb094c8701176c7a7dfc1e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe544dcab6c6a2cd20ff66c6d3181b5cbc88c105912866caedf7ae63ceb25729 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f197adb10a942a6c4de27fd6e8179d90d72820c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989016ff43e8eacc0926929f1690a0e292594a415551b8e6dfb4411a29d58ea6 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcb7eb0c9cea38504f9b7e203762cf56be3fac33 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43b75f19ce28b0f91a783892b7cdb25d45491bbde542f96c472bb9d87d0e958 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43230214ac689a3045c81514337bb9de9af9561b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd96cd520bb0f23cca4152fa516968cf862567839f0059c1de73e4645f4f987 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71920eeac9666981c979f5d10ff4177cbb1bda5c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:866d74d3d2053941d63405d021855a3f8faf84868e471f06c8f0b3363b88374e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9ebe2709e7f014a6431e10a08b9ee83756b9b83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/latest @@ -0,0 +1 @@ +global_step220 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc26f1e85f4e8e85881b70bb37705b907a71e2da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192b6eaac6b92a2de7d039b2fc8b1f373bff6953e1e6a952189b56167078edd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bbb17aaed064e29c0c9cd539bbc5d966a567a068 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/trainer_state.json @@ -0,0 +1,1030 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 4.4, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.0577057175486542, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.6953125, + "logps/chosen": -132.0, + "logps/rejected": -418.0, + "loss": 0.33818817138671875, + "memory(GiB)": 61.51, + "nll_loss": 0.138671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 14.4375, + "rewards/rejected": -2.078125, + "step": 165, + "train_speed(iter/s)": 0.113429 + }, + { + "epoch": 3.4, + "grad_norm": 
0.38519303074023925, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -498.0, + "logps/rejected": -159.0, + "loss": 0.3404090881347656, + "memory(GiB)": 61.51, + "nll_loss": 0.341796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 20.625, + "rewards/rejected": 1.3125, + "step": 170, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 3.5, + "grad_norm": 0.5718561646417449, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.6640625, + "logps/chosen": -302.0, + "logps/rejected": -436.0, + "loss": 0.3107784271240234, + "memory(GiB)": 61.51, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.8125, + "rewards/rejected": 1.3359375, + "step": 175, + "train_speed(iter/s)": 0.113326 + }, + { + "epoch": 3.6, + "grad_norm": 0.30064476351594455, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -418.0, + "logps/rejected": -247.0, + "loss": 0.39559040069580076, + "memory(GiB)": 61.51, + "nll_loss": 0.494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.375, + "rewards/margins": 18.0, + "rewards/rejected": 0.3203125, + "step": 180, + "train_speed(iter/s)": 0.113857 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.5, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44775390625, + "eval_nll_loss": 0.4140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.203125, + "eval_runtime": 3.7431, + "eval_samples_per_second": 1.069, + "eval_steps_per_second": 0.267, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.25784447177312775, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.9140625, + 
"logps/chosen": -412.0, + "logps/rejected": -197.0, + "loss": 0.3451987266540527, + "memory(GiB)": 61.51, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.875, + "rewards/rejected": 0.62109375, + "step": 185, + "train_speed(iter/s)": 0.11379 + }, + { + "epoch": 3.8, + "grad_norm": 0.3269795021594465, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -1.984375, + "logits/rejected": -1.859375, + "logps/chosen": -239.0, + "logps/rejected": -560.0, + "loss": 0.3305183410644531, + "memory(GiB)": 61.51, + "nll_loss": 0.478515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 17.0, + "rewards/rejected": -0.5, + "step": 190, + "train_speed(iter/s)": 0.114169 + }, + { + "epoch": 3.9, + "grad_norm": 0.46923090007407836, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.7421875, + "logps/chosen": -346.0, + "logps/rejected": -524.0, + "loss": 0.28177928924560547, + "memory(GiB)": 61.51, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 19.625, + "rewards/rejected": -0.9609375, + "step": 195, + "train_speed(iter/s)": 0.113877 + }, + { + "epoch": 4.0, + "grad_norm": 0.5973510030321513, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5234375, + "logps/chosen": -270.0, + "logps/rejected": -524.0, + "loss": 0.32286620140075684, + "memory(GiB)": 73.34, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 16.5, + "rewards/rejected": -0.19921875, + "step": 200, + "train_speed(iter/s)": 0.113643 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.75, + "eval_logps/rejected": -231.0, + "eval_loss": 0.44970703125, + "eval_nll_loss": 0.423828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + 
"eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.6961, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3215452292580796, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -318.0, + "logps/rejected": -564.0, + "loss": 0.3552096366882324, + "memory(GiB)": 73.34, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 15.3125, + "rewards/rejected": 0.890625, + "step": 205, + "train_speed(iter/s)": 0.113 + }, + { + "epoch": 4.2, + "grad_norm": 0.4025934432388528, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -2.0, + "logps/chosen": -248.0, + "logps/rejected": -215.0, + "loss": 0.2990260124206543, + "memory(GiB)": 73.34, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.5, + "rewards/margins": 17.5, + "rewards/rejected": 0.9375, + "step": 210, + "train_speed(iter/s)": 0.11323 + }, + { + "epoch": 4.3, + "grad_norm": 0.4259000697068638, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5078125, + "logps/chosen": -143.0, + "logps/rejected": -896.0, + "loss": 0.31557292938232423, + "memory(GiB)": 73.34, + "nll_loss": 0.1923828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 15.0, + "rewards/rejected": -1.359375, + "step": 215, + "train_speed(iter/s)": 0.113298 + }, + { + "epoch": 4.4, + "grad_norm": 0.22959233205564938, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.5, + "logps/chosen": -78.5, + "logps/rejected": -800.0, + "loss": 0.2897603511810303, + "memory(GiB)": 73.34, + "nll_loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 14.5625, + "rewards/rejected": -2.640625, + "step": 220, + 
"train_speed(iter/s)": 0.113169 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.75, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.125, + "eval_logps/rejected": -231.0, + "eval_loss": 0.452880859375, + "eval_nll_loss": 0.439453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.7921, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 0.264, + "step": 220 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 641014053470208.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-220/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c565105af9c60348ea70b6a26dd0cb100bc0d337 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f079bfe423cc1164c464e6fd6dd922bc3931c5be7e8725a252628cd7c6d842a +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06d85666776aa7a8607839a853317a49b3df79e2 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc2b7cf5baac4d158e74d1deee3fb69c2e143798822acb7f2f7bdc0d55cca1f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e9ab275c21b50143766f327f1fe98b955ba278b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dd943ffc1581d87485e32d93246aeb421f43d68add92884556ed6f001f5d170 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5861dad285dda6dd199945d9145cb599de0e6a0f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3661f594bc36872c6ebd2a5fbe1772fac4842cf85e3515165cc437fb6ae989 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b6c8c0508a5461a377939294a6e78e2d2a871be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf86f172aa04e4e98511a3f7aac40582c241060e5b186f3ca91d461bf8202f8 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf9f2da4986930a30e99f8c01afadb66cbed30c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be15d5d1d8f00b5b6a7e68131b11bc9ce6fadf0a3ba037a0a0a970bbb701293f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ba66bc4ec6ec0f629594d21e45fdf05a172e6a73 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43cad7756d964ba72f93cfd018404458a00ac23fa8928d70fa61acbc525dfac +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca65c42fcc48ce85a14e5de63a6dcbabafe02f5f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c81fae2c9dee61c88d932e7459473991fa68f7648729cb1f8b451018122d27cd +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a37e084f7673f4721ee4e5247657af8002874b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f8d32a84f9d56cdb981423a316563afa9a6b19d4c77135020e6084b6b5e3204b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7206233e292811c1fb8903f3a2d8770a9b770e79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02604cded14cd2d79957c721d05827970ba405895a73fe52e20900e6ed29ef33 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..905a4fca3ed9c3b213a7572dda6b448b2d5fa7cf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e535dab82d1041cedb024955c508d5ae4309dea10118d364308e96b4b3be15 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3e2773c79471caacdd8feb060b17dc30e450c1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db22f5469e1617c402491fd8895eb803b15fb5f1c081edebb47b1e3e36280c95 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9e7b45781caf41ba4ce0245bc2df6efdf30cfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0866ffb98c3cf04d78b0b6810beb088a8bbdf6fb5f9fd749ba3f6eebd7e25a09 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4706b5e75c91f1487b9a1cd764b35797ffa72f08 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4573283ab9c550879810ceaa24b2af427002a5344fafffad523593738c6f3699 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37b0caddbf1da86d62ff0150ff7df116cb3fb272 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6577905b359d657064d8502259bb64d9c14c8c36fdb2462a8e23cd491e5079bc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9038f23f62ee384c72ca4adcd28f87704ddd8a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf3531a040f9a558f8ed1527d3d9f6a0aa255cac84a4cf3bbfe0d77a7bf6564 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dac544c45d5dd20ea5ad34f8e0c30dfc7531450 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b5a6aefd5b4b41357dbc7622b7284276bdd6dd5b67712ea49602271d8fe9c9 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/latest new file mode 100644 index 0000000000000000000000000000000000000000..161e63cf7292b2184098d115f0621d2ed09e86c5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/latest @@ -0,0 +1 @@ +global_step240 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3a6ea45dd4e59b9683f66476f460fa0c77a9d66 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0c9979566a5d89cb3c766336548670ec6f2291deba1b7ab1764c12d3187b24 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42e6b0d6985c9b3f0cec701759e0b3d671c77abd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e36a570d6158fc25d1cf5d9f8f450fc64c5a7683330277f89ff76d5f2fc6cd +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..376994a32199299a2a48b62753947cdb1f7ad72a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f619cbef4b74f1680d667c8788285a602392e63bdf3760ef3a59ec8864d483 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f1edb2dfec55e5cbead7ae3d14351c3650c4f77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1fc037fba93ace1bf7ce01b1a5f7d785698d47b4cc2cedf2300bbf7a41ebf05c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..016d34db4ec6597c207021d026234c9692c3f3ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab728c2461d6d1c64f04d7cbfdfcbfa7bd7ad0ef6e19d52458501ee81b27128 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d7824c2bd9e8b1cec7f0d84d673017b0da62e43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27530e653ebf5997ae3159cdcde264607e6a6f86b7e3c7a1b3a1e8301cd43d03 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f41ee261ad98d2d0eb8f09167a5b32604513b56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c1fddaeb1257697bd7c0101abf1ab23f2925d0d9165cd8bddfbd22f8444db2b7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8816834cc1c0e822e11a8df138fa41557f3a0fb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942af3734a320fe12a3205a47ca1cdc7d1f0996bfde86c020a35545ccd2fd418 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce5faf9896aeadd65d47acddb4b510a6fc3c65f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a46b33bfe1e26ebea81904070b93f8e7376ae49add370042b1998521eed8ba +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..57970657166d3b4e551c7fd49e689f1811dc3638 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/trainer_state.json @@ -0,0 +1,1119 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 4.8, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.0577057175486542, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.6953125, + "logps/chosen": -132.0, + "logps/rejected": -418.0, + "loss": 0.33818817138671875, + "memory(GiB)": 61.51, + "nll_loss": 0.138671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 14.4375, + "rewards/rejected": -2.078125, + "step": 165, + "train_speed(iter/s)": 0.113429 + }, + { + "epoch": 3.4, + "grad_norm": 
0.38519303074023925, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -498.0, + "logps/rejected": -159.0, + "loss": 0.3404090881347656, + "memory(GiB)": 61.51, + "nll_loss": 0.341796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 20.625, + "rewards/rejected": 1.3125, + "step": 170, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 3.5, + "grad_norm": 0.5718561646417449, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.6640625, + "logps/chosen": -302.0, + "logps/rejected": -436.0, + "loss": 0.3107784271240234, + "memory(GiB)": 61.51, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.8125, + "rewards/rejected": 1.3359375, + "step": 175, + "train_speed(iter/s)": 0.113326 + }, + { + "epoch": 3.6, + "grad_norm": 0.30064476351594455, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -418.0, + "logps/rejected": -247.0, + "loss": 0.39559040069580076, + "memory(GiB)": 61.51, + "nll_loss": 0.494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.375, + "rewards/margins": 18.0, + "rewards/rejected": 0.3203125, + "step": 180, + "train_speed(iter/s)": 0.113857 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.5, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44775390625, + "eval_nll_loss": 0.4140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.203125, + "eval_runtime": 3.7431, + "eval_samples_per_second": 1.069, + "eval_steps_per_second": 0.267, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.25784447177312775, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.9140625, + 
"logps/chosen": -412.0, + "logps/rejected": -197.0, + "loss": 0.3451987266540527, + "memory(GiB)": 61.51, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.875, + "rewards/rejected": 0.62109375, + "step": 185, + "train_speed(iter/s)": 0.11379 + }, + { + "epoch": 3.8, + "grad_norm": 0.3269795021594465, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -1.984375, + "logits/rejected": -1.859375, + "logps/chosen": -239.0, + "logps/rejected": -560.0, + "loss": 0.3305183410644531, + "memory(GiB)": 61.51, + "nll_loss": 0.478515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 17.0, + "rewards/rejected": -0.5, + "step": 190, + "train_speed(iter/s)": 0.114169 + }, + { + "epoch": 3.9, + "grad_norm": 0.46923090007407836, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.7421875, + "logps/chosen": -346.0, + "logps/rejected": -524.0, + "loss": 0.28177928924560547, + "memory(GiB)": 61.51, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 19.625, + "rewards/rejected": -0.9609375, + "step": 195, + "train_speed(iter/s)": 0.113877 + }, + { + "epoch": 4.0, + "grad_norm": 0.5973510030321513, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5234375, + "logps/chosen": -270.0, + "logps/rejected": -524.0, + "loss": 0.32286620140075684, + "memory(GiB)": 73.34, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 16.5, + "rewards/rejected": -0.19921875, + "step": 200, + "train_speed(iter/s)": 0.113643 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.75, + "eval_logps/rejected": -231.0, + "eval_loss": 0.44970703125, + "eval_nll_loss": 0.423828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + 
"eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.6961, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3215452292580796, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -318.0, + "logps/rejected": -564.0, + "loss": 0.3552096366882324, + "memory(GiB)": 73.34, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 15.3125, + "rewards/rejected": 0.890625, + "step": 205, + "train_speed(iter/s)": 0.113 + }, + { + "epoch": 4.2, + "grad_norm": 0.4025934432388528, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -2.0, + "logps/chosen": -248.0, + "logps/rejected": -215.0, + "loss": 0.2990260124206543, + "memory(GiB)": 73.34, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.5, + "rewards/margins": 17.5, + "rewards/rejected": 0.9375, + "step": 210, + "train_speed(iter/s)": 0.11323 + }, + { + "epoch": 4.3, + "grad_norm": 0.4259000697068638, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5078125, + "logps/chosen": -143.0, + "logps/rejected": -896.0, + "loss": 0.31557292938232423, + "memory(GiB)": 73.34, + "nll_loss": 0.1923828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 15.0, + "rewards/rejected": -1.359375, + "step": 215, + "train_speed(iter/s)": 0.113298 + }, + { + "epoch": 4.4, + "grad_norm": 0.22959233205564938, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.5, + "logps/chosen": -78.5, + "logps/rejected": -800.0, + "loss": 0.2897603511810303, + "memory(GiB)": 73.34, + "nll_loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 14.5625, + "rewards/rejected": -2.640625, + "step": 220, + 
"train_speed(iter/s)": 0.113169 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.75, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.125, + "eval_logps/rejected": -231.0, + "eval_loss": 0.452880859375, + "eval_nll_loss": 0.439453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.7921, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 0.264, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.4254404063804997, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -1.75, + "logits/rejected": -1.9296875, + "logps/chosen": -524.0, + "logps/rejected": -220.0, + "loss": 0.334822416305542, + "memory(GiB)": 73.34, + "nll_loss": 0.5546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.875, + "rewards/margins": 23.125, + "rewards/rejected": 0.87890625, + "step": 225, + "train_speed(iter/s)": 0.112881 + }, + { + "epoch": 4.6, + "grad_norm": 0.5983738617042668, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": -1.71875, + "logits/rejected": -1.8046875, + "logps/chosen": -247.0, + "logps/rejected": -684.0, + "loss": 0.2966593265533447, + "memory(GiB)": 73.34, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 17.75, + "rewards/rejected": -0.330078125, + "step": 230, + "train_speed(iter/s)": 0.112845 + }, + { + "epoch": 4.7, + "grad_norm": 0.4714305885244571, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -1.7265625, + "logits/rejected": -1.6796875, + "logps/chosen": -184.0, + "logps/rejected": -608.0, + "loss": 0.2641402244567871, + "memory(GiB)": 73.34, + "nll_loss": 0.2392578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 15.4375, + "rewards/rejected": -3.078125, + "step": 235, + "train_speed(iter/s)": 0.112889 + }, + { + "epoch": 4.8, + "grad_norm": 0.3902228714814237, + "learning_rate": 
4.386387988014273e-07, + "logits/chosen": -2.046875, + "logits/rejected": -2.046875, + "logps/chosen": -374.0, + "logps/rejected": -608.0, + "loss": 0.31013832092285154, + "memory(GiB)": 73.34, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 18.875, + "rewards/rejected": 0.55078125, + "step": 240, + "train_speed(iter/s)": 0.11322 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.7421875, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.5, + "eval_logps/rejected": -233.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.455078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": -2.5, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 697911758553088.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-240/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42d7e2346f64275a1de8559b045040bbcb8fef91 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:040c6cad160298add30fa570fedb28148361a3fd4d169043ebc09098be657f37 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6f0743ac84e4553504b15f4eb4e32c40dd7670 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:198067efc33c226bed7d8c1072fa1796810997c31baf75cafce4bc810324a077 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e15fd36ebc5f3dd5d298aeb4664d05ce8f0e97c0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43284ba19ca9572eda5c1cd7c1ea1376af0c5a6b44814e5fa6af7689a8a6b1eb +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4fdbc9e0890a2aab307e84a1a475ec2056993d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1090fcc647ec253c6722d493ac688d55fbe080aab2f794d9b7cb032efc2538dc +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..303aa2edd0d5f83463de637588d883c9672b5a6a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190b241ce8da0073462bdd19226625bcd2535873c4ddcd81ecbb201eb1429724 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6a72bda3ba513e3d71fbd6be847de85b4f59f3c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2df206d1538a549aba809fe5284236aee588644ca44ce45c1529718608468b95 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..5a27137b537b65c673c268bbc25ac88b54e9216d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be5e47f1ef913a212ebfd378eb4510538cc4dc9057d2e75bd54b4f86680ed2c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5b800a921df4ecad0aec8d183453b695ebbfc77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fcf712cca8355ec69c00cde79f26a51e36ce7eea90afcf7fe1ea5089ee03c0e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07476530fd19f11ed6d818396aeeabbbc83c6452 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8d9ee9b361de64c15c2a3c2ef944f1f3f042adc45b21bf3f0f43518b03f5164c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a74c72595344e47d125eee40af57b1f2649b56f4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:402309bdc3ad5853e48f3ef675a9d40d3a415e4fd0142893c49a69df6f055314 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e82ec0902114f99a626f493c97be4bed74a33a2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d14e0cdecbd47f30ce23c03790cb72e556beeb8e8743e760ac502b0b5a88fc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86dbbf0c0e1448a36ca86dabe1c03c3c73d80299 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab93cfa6f04c0f56ec75d10f12cc26a3afd6807507abf85bf7836417cf8e5eb5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24840728efea46dd4f1b9d2890d1d6418c54d5c9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce4ffd58d54fef5848a0455eea663fd72b371741f5d7ce93df05be02e982ed00 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dd6c3a526c80c184698a2bf3a084d952376d7f0 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6da7e85b61a09c3872fe5e2bb7776ff50db9724141b99238b261f5bd8b0b04e3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73f670d36fdb6675b12817e9a25e3d3c4d9f3615 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a2714e02715e4e558224e47b6461252e3a4aafe9c42d9a4c5474424fb44b32b +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..461c06b430d8736579badfe588825138ee056ff4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15062182dd944e2770bbd3661a15f0ad423a617adac5c9400b175e56124b0dcf +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9dbb8b98ce6607964fd48afa3b50d981fa031a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a953f07ad1f6d0cbe70ea9a3074c82f719ef703588f08ad405efd7301002fb6 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/latest new file mode 100644 index 0000000000000000000000000000000000000000..87449ff1a854ba4a77ea33fbc24adaed3311d6b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/latest @@ -0,0 +1 @@ +global_step250 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab29abc7c5c196288fd5c119c67c4f655f27d44c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5c4738c31c5c9a38e1f586256d59a0e8e7d02641b9b9af2afdbe078440aeb4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8e0ba47a098b34da66857368b41c80a5d9d796f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d374b3390eb52ec7f6161c06272d4f26cb715692bdf2ad5374287b6de420ca3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7676e48e7dd332be5f46585fc5f824c5791f76ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24111edc5a6a2994166cd410155ee3c630816d0fe21c13808ebd2a2ae45bc9d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..228202ae722c05ed5fafc13eeac33a8a2685cca5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:157b21eda1c7f898e519251deed08049767ffba123797289de56343a92ba7380 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a63de21fa3e29782ced5828f8f34fba46bad33 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb615552e5845759bc13aa2ae50c0525fbf941fa76ee2e2c20cb9838fe1995 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d487727115f1120e55e91ad9583fb23ff8e34083 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf720fc22147ce563d6f2c2f6f3d916a7e8b7af174b480d072b5c822e992aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90628d8fd79ee2a98fb904251b6d7938f5120b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d055d3b033dc8e6fc2a19aa95162960544ab94a903988874315efe4ed5aa8e13 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e1556a7ec04e7309f4c9130351c880ef6a0626 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e03c685f2e019350bfdd41f006495a18690aacbccd7ffc1f40de827f433eb87 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..871b4a6cbd60ea4b2ef2416f3a46bbe632ddb667 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b1af2ae92a304371e36f6c1b7001f5dafc395be0b17c480957fc7fb58d8cd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc493f11827c29786c85d3ae6279f73d4e1eb93d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/trainer_state.json @@ -0,0 +1,1172 @@ 
+{ + "best_metric": 0.41748047, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": 
-900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + "nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + 
"rewards/margins": 7.15625, + "rewards/rejected": 1.625, + "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 
2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + 
"logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, + "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + 
"eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.4659913406853354, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.84375, + "logits/rejected": -1.78125, + "logps/chosen": -236.0, + "logps/rejected": -636.0, + "loss": 0.5656494140625, + "memory(GiB)": 61.51, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 13.5625, + "rewards/rejected": -1.3671875, + "step": 85, + "train_speed(iter/s)": 0.114522 + }, + { + "epoch": 1.8, + "grad_norm": 0.3257493171354355, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.921875, + "logps/chosen": -236.0, + "logps/rejected": -434.0, + "loss": 0.41357421875, + "memory(GiB)": 61.51, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 13.4375, + "rewards/rejected": -0.291015625, + "step": 90, + "train_speed(iter/s)": 0.113825 + }, + { + "epoch": 1.9, + "grad_norm": 0.2864922683958331, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.8359375, + "logits/rejected": -1.578125, + "logps/chosen": -286.0, + "logps/rejected": -456.0, + "loss": 0.4584716796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.1875, + "rewards/rejected": 0.921875, + "step": 95, + "train_speed(iter/s)": 0.113891 + }, + { + "epoch": 2.0, + "grad_norm": 0.2579590962972889, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.6171875, + "logps/chosen": -221.0, + "logps/rejected": -292.0, + "loss": 0.405670166015625, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0625, + "rewards/margins": 11.4375, + "rewards/rejected": 0.62109375, + "step": 100, + "train_speed(iter/s)": 0.11369 + 
}, + { + "epoch": 2.0, + "eval_logits/chosen": -1.859375, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -6.875, + "eval_logps/rejected": -220.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7471, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.280595138331498, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -1.9453125, + "logits/rejected": -2.0, + "logps/chosen": -360.0, + "logps/rejected": -556.0, + "loss": 0.42579345703125, + "memory(GiB)": 61.51, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 14.25, + "rewards/rejected": 1.28125, + "step": 105, + "train_speed(iter/s)": 0.112746 + }, + { + "epoch": 2.2, + "grad_norm": 0.33433345487081023, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4765625, + "logps/chosen": -330.0, + "logps/rejected": -836.0, + "loss": 0.41505126953125, + "memory(GiB)": 61.51, + "nll_loss": 0.3671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5, + "rewards/margins": 11.875, + "rewards/rejected": 1.640625, + "step": 110, + "train_speed(iter/s)": 0.1128 + }, + { + "epoch": 2.3, + "grad_norm": 0.173944606658398, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7578125, + "logps/chosen": -424.0, + "logps/rejected": -588.0, + "loss": 0.443218994140625, + "memory(GiB)": 61.51, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 13.3125, + "rewards/rejected": 1.484375, + "step": 115, + "train_speed(iter/s)": 0.113012 + }, + { + "epoch": 2.4, + "grad_norm": 0.30679422743634893, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.8046875, + 
"logits/rejected": -1.84375, + "logps/chosen": -310.0, + "logps/rejected": -292.0, + "loss": 0.3655853271484375, + "memory(GiB)": 61.51, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.625, + "rewards/rejected": 0.490234375, + "step": 120, + "train_speed(iter/s)": 0.113772 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.53125, + "eval_logps/rejected": -226.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 10.5625, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.747, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.320148655196901, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.7109375, + "logps/chosen": -38.25, + "logps/rejected": -944.0, + "loss": 0.3992919921875, + "memory(GiB)": 61.51, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5, + "rewards/margins": 14.3125, + "rewards/rejected": -3.796875, + "step": 125, + "train_speed(iter/s)": 0.113708 + }, + { + "epoch": 2.6, + "grad_norm": 0.4365836857173622, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.9140625, + "logps/chosen": -324.0, + "logps/rejected": -249.0, + "loss": 0.3893707275390625, + "memory(GiB)": 61.51, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 16.75, + "rewards/rejected": 0.51953125, + "step": 130, + "train_speed(iter/s)": 0.114337 + }, + { + "epoch": 2.7, + "grad_norm": 0.26221498737417476, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.765625, + "logps/chosen": -125.0, + "logps/rejected": -524.0, + "loss": 0.35826416015625, + "memory(GiB)": 
61.51, + "nll_loss": 0.19140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 12.125, + "rewards/rejected": -1.078125, + "step": 135, + "train_speed(iter/s)": 0.114575 + }, + { + "epoch": 2.8, + "grad_norm": 0.21875233894682816, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.9921875, + "logits/rejected": -1.953125, + "logps/chosen": -288.0, + "logps/rejected": -604.0, + "loss": 0.39818534851074217, + "memory(GiB)": 61.51, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.375, + "rewards/margins": 15.625, + "rewards/rejected": -1.2421875, + "step": 140, + "train_speed(iter/s)": 0.115122 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.828125, + "eval_logits/rejected": -1.59375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -226.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.5, + "eval_rewards/rejected": -1.796875, + "eval_runtime": 3.7658, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.5655093745020108, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.765625, + "logits/rejected": -1.8984375, + "logps/chosen": -478.0, + "logps/rejected": -552.0, + "loss": 0.4264970779418945, + "memory(GiB)": 61.51, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 14.125, + "rewards/rejected": 0.90234375, + "step": 145, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 3.0, + "grad_norm": 0.39937567960730846, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -344.0, + "logps/rejected": -251.0, + "loss": 0.321661376953125, + "memory(GiB)": 61.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 17.75, + 
"rewards/rejected": 1.71875, + "step": 150, + "train_speed(iter/s)": 0.11347 + }, + { + "epoch": 3.1, + "grad_norm": 0.28789168702712753, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.78125, + "logps/chosen": -446.0, + "logps/rejected": -506.0, + "loss": 0.40140838623046876, + "memory(GiB)": 61.51, + "nll_loss": 0.3828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.375, + "rewards/margins": 15.6875, + "rewards/rejected": 0.6796875, + "step": 155, + "train_speed(iter/s)": 0.113762 + }, + { + "epoch": 3.2, + "grad_norm": 0.28174536188893384, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.90625, + "logits/rejected": -2.171875, + "logps/chosen": -378.0, + "logps/rejected": -322.0, + "loss": 0.3630828857421875, + "memory(GiB)": 61.51, + "nll_loss": 0.359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 16.625, + "rewards/rejected": 1.3515625, + "step": 160, + "train_speed(iter/s)": 0.113646 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.7578125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -8.5625, + "eval_logps/rejected": -229.0, + "eval_loss": 0.437744140625, + "eval_nll_loss": 0.373046875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5625, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.09375, + "eval_runtime": 3.7498, + "eval_samples_per_second": 1.067, + "eval_steps_per_second": 0.267, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.0577057175486542, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.609375, + "logits/rejected": -1.6953125, + "logps/chosen": -132.0, + "logps/rejected": -418.0, + "loss": 0.33818817138671875, + "memory(GiB)": 61.51, + "nll_loss": 0.138671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 14.4375, + "rewards/rejected": -2.078125, + "step": 165, + "train_speed(iter/s)": 0.113429 + }, + { + "epoch": 3.4, + "grad_norm": 
0.38519303074023925, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.9296875, + "logps/chosen": -498.0, + "logps/rejected": -159.0, + "loss": 0.3404090881347656, + "memory(GiB)": 61.51, + "nll_loss": 0.341796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 20.625, + "rewards/rejected": 1.3125, + "step": 170, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 3.5, + "grad_norm": 0.5718561646417449, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.6640625, + "logps/chosen": -302.0, + "logps/rejected": -436.0, + "loss": 0.3107784271240234, + "memory(GiB)": 61.51, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.8125, + "rewards/rejected": 1.3359375, + "step": 175, + "train_speed(iter/s)": 0.113326 + }, + { + "epoch": 3.6, + "grad_norm": 0.30064476351594455, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -418.0, + "logps/rejected": -247.0, + "loss": 0.39559040069580076, + "memory(GiB)": 61.51, + "nll_loss": 0.494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.375, + "rewards/margins": 18.0, + "rewards/rejected": 0.3203125, + "step": 180, + "train_speed(iter/s)": 0.113857 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.5, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44775390625, + "eval_nll_loss": 0.4140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.625, + "eval_rewards/rejected": -2.203125, + "eval_runtime": 3.7431, + "eval_samples_per_second": 1.069, + "eval_steps_per_second": 0.267, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.25784447177312775, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.9140625, + 
"logps/chosen": -412.0, + "logps/rejected": -197.0, + "loss": 0.3451987266540527, + "memory(GiB)": 61.51, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.875, + "rewards/rejected": 0.62109375, + "step": 185, + "train_speed(iter/s)": 0.11379 + }, + { + "epoch": 3.8, + "grad_norm": 0.3269795021594465, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -1.984375, + "logits/rejected": -1.859375, + "logps/chosen": -239.0, + "logps/rejected": -560.0, + "loss": 0.3305183410644531, + "memory(GiB)": 61.51, + "nll_loss": 0.478515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.5, + "rewards/margins": 17.0, + "rewards/rejected": -0.5, + "step": 190, + "train_speed(iter/s)": 0.114169 + }, + { + "epoch": 3.9, + "grad_norm": 0.46923090007407836, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.7421875, + "logps/chosen": -346.0, + "logps/rejected": -524.0, + "loss": 0.28177928924560547, + "memory(GiB)": 61.51, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 19.625, + "rewards/rejected": -0.9609375, + "step": 195, + "train_speed(iter/s)": 0.113877 + }, + { + "epoch": 4.0, + "grad_norm": 0.5973510030321513, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5234375, + "logps/chosen": -270.0, + "logps/rejected": -524.0, + "loss": 0.32286620140075684, + "memory(GiB)": 73.34, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 16.5, + "rewards/rejected": -0.19921875, + "step": 200, + "train_speed(iter/s)": 0.113643 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.78125, + "eval_logits/rejected": -1.6015625, + "eval_logps/chosen": -9.75, + "eval_logps/rejected": -231.0, + "eval_loss": 0.44970703125, + "eval_nll_loss": 0.423828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + 
"eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.6961, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3215452292580796, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -318.0, + "logps/rejected": -564.0, + "loss": 0.3552096366882324, + "memory(GiB)": 73.34, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 15.3125, + "rewards/rejected": 0.890625, + "step": 205, + "train_speed(iter/s)": 0.113 + }, + { + "epoch": 4.2, + "grad_norm": 0.4025934432388528, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -2.0, + "logps/chosen": -248.0, + "logps/rejected": -215.0, + "loss": 0.2990260124206543, + "memory(GiB)": 73.34, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.5, + "rewards/margins": 17.5, + "rewards/rejected": 0.9375, + "step": 210, + "train_speed(iter/s)": 0.11323 + }, + { + "epoch": 4.3, + "grad_norm": 0.4259000697068638, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5078125, + "logps/chosen": -143.0, + "logps/rejected": -896.0, + "loss": 0.31557292938232423, + "memory(GiB)": 73.34, + "nll_loss": 0.1923828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 15.0, + "rewards/rejected": -1.359375, + "step": 215, + "train_speed(iter/s)": 0.113298 + }, + { + "epoch": 4.4, + "grad_norm": 0.22959233205564938, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.5, + "logps/chosen": -78.5, + "logps/rejected": -800.0, + "loss": 0.2897603511810303, + "memory(GiB)": 73.34, + "nll_loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.9375, + "rewards/margins": 14.5625, + "rewards/rejected": -2.640625, + "step": 220, + 
"train_speed(iter/s)": 0.113169 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.75, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.125, + "eval_logps/rejected": -231.0, + "eval_loss": 0.452880859375, + "eval_nll_loss": 0.439453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.7921, + "eval_samples_per_second": 1.055, + "eval_steps_per_second": 0.264, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.4254404063804997, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -1.75, + "logits/rejected": -1.9296875, + "logps/chosen": -524.0, + "logps/rejected": -220.0, + "loss": 0.334822416305542, + "memory(GiB)": 73.34, + "nll_loss": 0.5546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.875, + "rewards/margins": 23.125, + "rewards/rejected": 0.87890625, + "step": 225, + "train_speed(iter/s)": 0.112881 + }, + { + "epoch": 4.6, + "grad_norm": 0.5983738617042668, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": -1.71875, + "logits/rejected": -1.8046875, + "logps/chosen": -247.0, + "logps/rejected": -684.0, + "loss": 0.2966593265533447, + "memory(GiB)": 73.34, + "nll_loss": 0.314453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 17.75, + "rewards/rejected": -0.330078125, + "step": 230, + "train_speed(iter/s)": 0.112845 + }, + { + "epoch": 4.7, + "grad_norm": 0.4714305885244571, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -1.7265625, + "logits/rejected": -1.6796875, + "logps/chosen": -184.0, + "logps/rejected": -608.0, + "loss": 0.2641402244567871, + "memory(GiB)": 73.34, + "nll_loss": 0.2392578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.375, + "rewards/margins": 15.4375, + "rewards/rejected": -3.078125, + "step": 235, + "train_speed(iter/s)": 0.112889 + }, + { + "epoch": 4.8, + "grad_norm": 0.3902228714814237, + "learning_rate": 
4.386387988014273e-07, + "logits/chosen": -2.046875, + "logits/rejected": -2.046875, + "logps/chosen": -374.0, + "logps/rejected": -608.0, + "loss": 0.31013832092285154, + "memory(GiB)": 73.34, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.5, + "rewards/margins": 18.875, + "rewards/rejected": 0.55078125, + "step": 240, + "train_speed(iter/s)": 0.11322 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.7421875, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.5, + "eval_logps/rejected": -233.0, + "eval_loss": 0.45703125, + "eval_nll_loss": 0.455078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.875, + "eval_rewards/rejected": -2.5, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 240 + }, + { + "epoch": 4.9, + "grad_norm": 0.21408464592337356, + "learning_rate": 1.0978021666005478e-07, + "logits/chosen": -1.6875, + "logits/rejected": -1.9921875, + "logps/chosen": -450.0, + "logps/rejected": -490.0, + "loss": 0.3295879364013672, + "memory(GiB)": 73.34, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.0, + "rewards/margins": 17.375, + "rewards/rejected": 0.66796875, + "step": 245, + "train_speed(iter/s)": 0.113152 + }, + { + "epoch": 5.0, + "grad_norm": 0.1773336882227468, + "learning_rate": 0.0, + "logits/chosen": -1.796875, + "logits/rejected": -1.90625, + "logps/chosen": -332.0, + "logps/rejected": -408.0, + "loss": 0.36420450210571287, + "memory(GiB)": 73.34, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 17.0, + "rewards/rejected": -1.109375, + "step": 250, + "train_speed(iter/s)": 0.113433 + }, + { + "epoch": 5.0, + "eval_logits/chosen": -1.7421875, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -10.3125, + "eval_logps/rejected": -231.0, + "eval_loss": 0.45654296875, + "eval_nll_loss": 0.44921875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.296875, + "eval_runtime": 3.7077, + "eval_samples_per_second": 1.079, + "eval_steps_per_second": 0.27, + "step": 250 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 725771410997248.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bcb60c55de8b70eb71afa7de8a4e9baa3749d455 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8af8e927cb2ffcb9a0c11b2a2b7afc3d41ef9da0d6369df7e12aa09fac01752 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8938d12894fa89cf41fe58afcd94896b58d0b90 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8effc3998ea9291580eec083b2768935e445eed677470d13d43ee45ae1402b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fecfe5a1fca05623f773c5f1fd262bb9fd00ea7d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46697693d2fa7240c83040916d4e5a6c6c2da4f23da0a5090d3cfff274c1d23e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b8d47cd23048bec56e7f2f3ffc2b6f0db18f646 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93aa12bda9d0504008fda7b95f70e8559ffca4b1e0a5da5f6e671ec95ca2dda7 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d93d0144863d5da13d7f5c7aff8d09ffc8de2bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80205d6bb9b03202c95c5de5afab5bd31bc5a31bab249c3552d808bdd63d66ef +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac21df5bae911699713d6b4121a40b72cd45cb0b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c72b35566eca85d453d59c4786a426751db5120d7fd3903f2e93ad1a1804469 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..31982e1618d8238f3fd59a94e062e4bb1e03e747 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688892b954f64b6da799db21c910e3ccceed84a15e38a4ea3d80ec681393f33f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ad5d4b2fa0d33e4851e407619f13ed3ff0a16d7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0b291b8c6039972095f8a2dac4f9265a5534add3ba0e6a7e77ff6ce367f8a8 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae76aa79c356d4183c3e887706ab6e9f46fa3003 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:232d754fe9a6cd55eaa8aa6416a869377eb630b9dde94f0cc3f775244e22019c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afd26448610161980f83389f727fb1c7e12384c9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e085ec99f66d7d0af52d756897f1985a3cd2dd897bef8d94ef9897cca041182d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41d4772e898805acd074c3ccf646bbd353bf8120 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26eedb4add806d5e06a1c23c0a9210eeefa44563ed1eb1abe03cd2950196625e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebfe31c48c46a1583b87a294bc772457442a932e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b8eb769532bfd9131fb7c05d8ab9e4e4dd0216c54f5c1700e3f1cdbb20f30f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea7c658d78f301f142e64b5da169ec48621fbac1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f601b6762c81f1718e65faa2789c8fb0eaa4b4cc66732ba21e98b1f2a8acddf +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f76b140f623a51abc2b94d754e79008928a8cc3c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:204a80b0d81f381d03d0f205c902e635cfa7c9bd2e35f27a02856973e596e92f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a894f5d5873fd6f34862665f66ed3ddb4071fc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:436daa2d4ffd49559a612ad1ef775beae0fde39b211e73e44dde51e6bffce9cd +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e870bbf579217a0ecf645fc18820fb853a2fb3e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0294c4de7634613150fe1f2dc93cce28ac48833a1cebcc72e7b9cbfd375ddb55 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2bc85cc1cdca8ed7b1330c6ba5d0d1a34494e4a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00637484f9a03fafb12f0adecf1553eca331026de857a830400189f8d934bb5a +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..35252259eb09d8de259231f63f19e786e44bc7b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb7d8df6ed170dd98dba8737bc9dd038af61afd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e144a445ffd57fbb5be9b5131f17149bde6c4ff5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10f35268ac2a0cb68abc0b78ba5b150b0f29d78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6ef21562e384e0889ec2400e8f84b6b0bc59035 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..126662e5f97dd3f9cd7fb87e70843d3d5532dde3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4e6b27cc99b0fa8e6bbf967892f9304b444d81d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..14e214a61e5311f2b7edf2200ec0365ed2dcc5e1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f899c511136dcc56c12c5058062d17686812d1b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab67fe2c6ce9ef0b836ccb0828d5bb1b593055af205b40af49c85139e46a6c8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b772e61994a5aa332efe418d64f07b5cc92f36c9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.44726562, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40", + "epoch": 0.8, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": -900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + 
"nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + "rewards/margins": 7.15625, + "rewards/rejected": 1.625, 
+ "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 116512967098368.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b15c1702ed3eacbb398a4d403ff51e1818041c3d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aadbb2f3032821ad0efd48462f2727e2fb49dd36136af1922ee8bb9cc8bd69af +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d025a71476ebc1a27329592cca29d5f54efe259 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c65449efaf6bf0cecfe6ce4b1f2e068081cbdfc2902586f25a042cb6a403c0d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffed155210d8b8baa3c874f244e9be49314a9aa5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f1ee4dedbbcf493ed58ca17fcd936038f78200c38a9230437cc0df54adf974 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58d97bcee83a77e78adc346ced84bfa52801ad58 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd4a870976ba2933d2558a6fb8bebc1ef96895063e8e5952e1246b34663d963 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..770ad20a2bfc289a918044f72903029fde3ff7c2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e93cd8161f30f0cb6ffd5fc48ddb64c93d165a7641b9269f19ef2dbdca90a69 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1410d9f107e5f3051f84af50d092c88e0734dc05 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feeac4d66022c883aa8568270d7bf2d34b263f8289a1a993a444d6e398fc5239 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..40799e2c01fda58e94ef0f014bb8a146da7d8a9b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f878204bfe3bfe144bab52176577864ad55926dee108947a86cedd844a1dd1b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ef35a93df1d37c06fe3ea8ee51e6a801a27d081 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bb072486db2482b5aa1308be2d9d2eae46bf2b6c0f72f508543982da50bfe7f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45b5808048135495d221a4ac1e44a0841cce11aa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:86dc5571e491d95a9f1ad6029ad268018b6938d293e6e9866d295f9e0486cecb +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71ccb80f7801d91fcb938a0dc6bff55147b56933 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567c1ace8b1e93fd6c783a37c5dfdf7536609142cc71c09bbc2a7b61dc557c08 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08733c1c803590d0a99c7585fd92b2451491e7a8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7fa2fe1e7240deab9eab4e4304f56998d4cb61ebbdaa76b6f67a6ab50beeaed +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95daecebcad586f88bf0821b42cf9f02594da6d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731cb7d6895c1f4c1de4bcbf529e851e944579284bff323e574d3a654304ec14 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41a7ecc9ef32ab9e91ea4a2ef288364e03a07b68 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c458cd3a48a1f4248291a59e85a1b738bc6bd65d0984b80f024cb2b04e6b614c +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ee789e691363e4beb416e61a755de8c4a8cb4bb --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a8a296f63be3425e0d5b8ae125bc53d146791900e81fe0609d4554bf1ac383 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4a267c1699804ac9fb7c48b25cfc4c95988eab4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a7cff424966f84a2d175976e7a0d500c01562ffc722a6b16d233221d217bf7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bf10104c1ada1ac62c91540c9dc0831239215af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bfc85ed774ad636887efc295f76c0e0566386edd44a158b1bbdf16532e3751 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c069015509ed9f38038788190639130a16dff981 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b678f065ab8621c0af00fd2ed6452fea96665bde9ba4d89ddf8ebf5cd0da5983 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..728c3241a49cbd920d5df86255fc8be4d97c5519 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa3ba485fff4300fd9029c17ba92c92630af852a00df0a0e8d16c233f74cbc8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c6eb31d6246659553662922eddc8b4fd64b7449 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.43310547, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60", + "epoch": 1.2, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": -900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + 
"nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + "rewards/margins": 7.15625, + "rewards/rejected": 1.625, 
+ "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": 
false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 173339265204224.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b48855d970590c4b0ebe1924124100bd26da97d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba5bb31ae085d6f3772d47bc976e0e04642bf680 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75567057f28f5b533fc639c71f1e460b594fb2e99d9608796cd9f639e1df74d0 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..971b257c9b7207c2a06ac2b24bf0a297c2a0d63b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd06c9e4121d6fd254eef59dd7901572c7d5c98f --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0492b51218789bddbca24aa8074def1694a2c7292a95437517c3e70009a36774 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40a3844a8186d06df832c9f13b6b570ad3656918 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24fcc031c88a7711cbad17332c34cd560e91271826c2153587593548f950907 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d96864694ede4b27cafa850711185fa0f1f5375 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70f3b0524e822e85490bf2abf3b194b504b9a7fedc7d22bf14edee3cc3794204 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e24cfc1a31997072f206bd63e1eacfd9893633e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c7fb939485573451b0a416258a5c9349e562be364e7aa6a0f41990fdff976dd +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a2a44ccdd00392bef01020c2f12c112bd9747a9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3435b4d7ec08457d0df2f8b6dc5545ee2319239b8410305805c9ae7c42a09432 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e1779e1ffbf2f4f0331cd2837e48472d63f69f1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a54fada3c867ba49b1ec095d6ea5190ba5e7c2ac4f80bf7f99fea124125ea49 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b62bffb361ed214a65fa6adadf226cdb563be8f3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f63e93dbecf0a69c447d86e76e09344f7efff3836814fc2fcfeafd61d850709 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c3913eeb337361523cb34137b4ad0cf877ccbd7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:68b3b9d4c507d775cbc0391f8e37b66c90197f1b6ea6069865d0797ff8c717b5 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88e179a3c6cc94d5d7ab1baee5012bc040caa2a9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5433fa7cb36d9c6a0960218414f18cad807626a60cb0b9c4af04d834611e82b3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9148bcfc4108d3b6ac68b0e24254e458e5acda1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b06cb84466a3d3e275e9dc70fb996bf4148b20ab3063417b2aae735c960c893 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a84a8772126e5af82470c89ade4dab170aa3d7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:009b0728223cb2e931eb99522cb7654987ce18e10def82e2517060c79ada8e1f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b420efb31027b3892404f3342e715a1b6d54a2d2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3ce249cd4c096e0cd3653f1f5c5667d11385cb513e0934aa1e6c51ede0fea9 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..882ac38b99a02b7e8548bcb82d0ea9b0c703f2b7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90473a56d60e1f1246de0f3705b97518cd82d450c7f589f25ebc253955805fd2 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ee03c817482049be4f004f5987c11c85ef5e571 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c83ee6333ed260743efb6d87eb4dba591eb0915f2e8992bd9fd9bf8926981cc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36bb2642b6e0f1792b785ddcf8e371eb5b6454d3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007241c4be968ef9b7e0252ee3646fc877d8755cca0334f1b72792ef4a9eea2c +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d3bc11afbd50cfb893487c891b1f4f463df63d2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c63ce470a652b53ad06820e8e391ede60921dc0942fe3c286f240a5584fcb4 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd2a62da4ca83b3b986d96dbf0eaeb82207ca93 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0628a9017696045a3a29e9eaffc71e9262d855716e773c0c3be760a1fe85bc8 +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ba5f3aba4388a582cd47f7f9e57cd5879b1cbd2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df342004a4d8e3626bf2a9f689fde7c8bfd6d995e14931f5496eda1f456cb6f2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27b0f7845c2b9530c3e6ed3ce232ff4e86b86122 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02096eb4e8850b91490e80e4a042e2e60f71bd2abc6a269d62c271649cb77d2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfb583fc43c6dd4395671708744cfd18c419970 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c778d3d0e7e3d5665fa0a9ecd92986609c430da08b41611d6c05dc19815a8 +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a8c64b1f15ac655b2be2a42fe61cabe2a877704 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d978dcb0c34e022ee6750e9d86814b8c82e4965d7e07662f35f06eeac12938f3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..262e8187e6caeca12ef3b0aa923b12afd697e03d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e83399aed1d9d173c3e07b2efa8530c956b62b2b68394c2ed0d43bd8bba9d1 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..72f794e31f8d3e0c63972e5076e1ed90c52087ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606ab3ca92e3d20c327c69fdcce7f7e39bec2f2c3538b036088b255f917e3ba4 +size 15984 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..244e7fdaa1cef2e82bd4e16afb10f32f68318bcc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276a987dd22c9093fec58921ba19f340a28f18bff635cc01324e09a3c37ac3a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e36a588df493151f57c8f73aa08129a3810c2c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee30cdff92a069fa950619177f737b278c096bc7c83c0e5bdea15a673218022 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ab3030005b210a44558f46a12adb96960ae2ceb1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.42626953, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80", + "epoch": 1.6, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.6390440726440465, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.1806640625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.068257 + }, + { + "epoch": 0.1, + "grad_norm": 4.257369809058118, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1875, + "logits/rejected": -1.375, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.076416015625, + "memory(GiB)": 19.96, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.112369 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963642842176267, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.7734375, + "logps/chosen": -494.0, + "logps/rejected": -474.0, + "loss": 1.82529296875, + "memory(GiB)": 25.78, + "nll_loss": 1.2109375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 9.775161743164062e-05, + "rewards/margins": -0.05908203125, + "rewards/rejected": 0.06005859375, + "step": 10, + "train_speed(iter/s)": 0.111366 + }, + { + "epoch": 0.3, + "grad_norm": 3.3753807300891543, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.359375, + "logps/chosen": -432.0, + "logps/rejected": -900.0, + "loss": 1.5294921875, + "memory(GiB)": 45.61, + 
"nll_loss": 1.1015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9453125, + "rewards/margins": 1.5859375, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.106324 + }, + { + "epoch": 0.4, + "grad_norm": 0.6730769877578879, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8671875, + "logits/rejected": -1.640625, + "logps/chosen": -312.0, + "logps/rejected": -266.0, + "loss": 1.016650390625, + "memory(GiB)": 45.61, + "nll_loss": 0.482421875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 5.4375, + "rewards/margins": 0.7265625, + "rewards/rejected": 4.71875, + "step": 20, + "train_speed(iter/s)": 0.108726 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.5546875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -11.375, + "eval_logps/rejected": -151.0, + "eval_loss": 0.5458984375, + "eval_nll_loss": 0.494140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.25, + "eval_rewards/margins": 2.5625, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 3.7447, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.267, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.4362878986584245, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.578125, + "logits/rejected": -1.421875, + "logps/chosen": -255.0, + "logps/rejected": -330.0, + "loss": 0.612744140625, + "memory(GiB)": 45.61, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.25, + "rewards/rejected": 3.84375, + "step": 25, + "train_speed(iter/s)": 0.10794 + }, + { + "epoch": 0.6, + "grad_norm": 0.262862568645726, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.9375, + "logps/chosen": -316.0, + "logps/rejected": -568.0, + "loss": 0.53232421875, + "memory(GiB)": 45.61, + "nll_loss": 0.59765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75, + "rewards/margins": 7.15625, + "rewards/rejected": 1.625, 
+ "step": 30, + "train_speed(iter/s)": 0.111231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5291957986954565, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9609375, + "logps/chosen": -442.0, + "logps/rejected": -234.0, + "loss": 0.537908935546875, + "memory(GiB)": 45.61, + "nll_loss": 0.71484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 10.8125, + "rewards/rejected": 0.41015625, + "step": 35, + "train_speed(iter/s)": 0.114244 + }, + { + "epoch": 0.8, + "grad_norm": 0.7105850587888475, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.859375, + "logps/chosen": -171.0, + "logps/rejected": -320.0, + "loss": 0.45433349609375, + "memory(GiB)": 45.61, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 8.625, + "rewards/rejected": 1.015625, + "step": 40, + "train_speed(iter/s)": 0.112752 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.03125, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -8.0625, + "eval_logps/rejected": -196.0, + "eval_loss": 0.447265625, + "eval_nll_loss": 0.3515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.4375, + "eval_rewards/rejected": 1.203125, + "eval_runtime": 3.7694, + "eval_samples_per_second": 1.061, + "eval_steps_per_second": 0.265, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.37874190112712086, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -2.0625, + "logps/chosen": -382.0, + "logps/rejected": -418.0, + "loss": 0.44189453125, + "memory(GiB)": 49.44, + "nll_loss": 0.44921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.875, + "rewards/margins": 15.0, + "rewards/rejected": 0.8828125, + "step": 45, + "train_speed(iter/s)": 0.111028 + }, + { + "epoch": 1.0, + "grad_norm": 2.6986237924570067, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": -1.7109375, + "logits/rejected": -1.515625, + "logps/chosen": -384.0, + "logps/rejected": -936.0, + "loss": 0.4344970703125, + "memory(GiB)": 50.89, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 9.1875, + "rewards/rejected": 2.046875, + "step": 50, + "train_speed(iter/s)": 0.112328 + }, + { + "epoch": 1.1, + "grad_norm": 0.19932517409606404, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.890625, + "logits/rejected": -1.953125, + "logps/chosen": -318.0, + "logps/rejected": -592.0, + "loss": 0.47237548828125, + "memory(GiB)": 61.51, + "nll_loss": 0.48828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.25, + "rewards/margins": 12.6875, + "rewards/rejected": 1.5703125, + "step": 55, + "train_speed(iter/s)": 0.112694 + }, + { + "epoch": 1.2, + "grad_norm": 0.16754375656348677, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.71875, + "logps/chosen": -180.0, + "logps/rejected": -490.0, + "loss": 0.4276611328125, + "memory(GiB)": 61.51, + "nll_loss": 0.322265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.8125, + "rewards/margins": 12.75, + "rewards/rejected": 0.050048828125, + "step": 60, + "train_speed(iter/s)": 0.113658 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.953125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -7.34375, + "eval_logps/rejected": -214.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.318359375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.25, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7111, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.26807398908522023, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -664.0, + "loss": 0.471875, 
+ "memory(GiB)": 61.51, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.6875, + "rewards/rejected": 0.890625, + "step": 65, + "train_speed(iter/s)": 0.11268 + }, + { + "epoch": 1.4, + "grad_norm": 0.23170654825310766, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.078125, + "logits/rejected": -1.578125, + "logps/chosen": -51.5, + "logps/rejected": -844.0, + "loss": 0.4139404296875, + "memory(GiB)": 61.51, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.25, + "rewards/margins": 7.0625, + "rewards/rejected": 2.15625, + "step": 70, + "train_speed(iter/s)": 0.113965 + }, + { + "epoch": 1.5, + "grad_norm": 0.26967258360728136, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.9375, + "logps/chosen": -370.0, + "logps/rejected": -364.0, + "loss": 0.4178466796875, + "memory(GiB)": 61.51, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.8125, + "rewards/rejected": 0.3671875, + "step": 75, + "train_speed(iter/s)": 0.114333 + }, + { + "epoch": 1.6, + "grad_norm": 0.18507453378934124, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.9375, + "logps/chosen": -446.0, + "logps/rejected": -128.0, + "loss": 0.4442626953125, + "memory(GiB)": 61.51, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.25, + "rewards/rejected": 1.53125, + "step": 80, + "train_speed(iter/s)": 0.115538 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8203125, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -6.90625, + "eval_logps/rejected": -228.0, + "eval_loss": 0.42626953125, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": -2.0, + "eval_runtime": 3.7179, + 
"eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 230090444374016.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b7dc9a75b47d443da66cd8d50cbdabfb45aaa56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9ef253ea7380282a684091983b0c953005fb074d8f960645bb4d1d06bfa3c +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..b85875aa3b309229131e482a617ad63bfdd7870d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f3aaf7051c66f8645b5cbafbaf93a3be88c15b1e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2198ccae1c2bfbd817b8cad363c84780896b28e0 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..a8c1dcbe7504776f208e77b01e4e75895c6e0b5b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..10172c44ee268e0b2965e7196ed7e46468ca462f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..398e54deba5e44f553470e2c83e73e59471dae7b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..bcd3f332a0a9276590b2d2aad6821619831dd64b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..59e270a589fc8d01828a867e42267012e56b00ff Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..fe739bb96512525c2952fac9083f03090b3ab8ec Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..e8d430219fd9dc324c84cc8f27aeff301d739cfe Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..0b7ca109e0514fadc76d38b19f75649545c2b650 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..06199185721d7620d3fca1c7a86f63d8bfc07ec6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..2326c7db5804b73bf88d17119f423bcc212af134 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..97933611536b8acda691b8294659ee4282ded3e6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..848ea0947df84b1ea5060950f205a241b391d1ac Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..f869fa37203441700940366c27c7d20493a62bae Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..0ec167bc97fdafa8c22d52915311e59a1694fa7b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c410fb4871e979ea592cc8e08c1475f0ce71451c Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..d25573a2054c24c035c1a9c80416ecf8ea695923 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..363d29e2e70493b7d93a830f08dd09112299410d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..f535a13f24a3fac707eee4fc5a01fb09311139fe Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_memory(GiB).png new file mode 100644 
index 0000000000000000000000000000000000000000..5c0ef38503bcfe242e2cca73faa67cd139ded989 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a35cfe372a755e2d87461d816cbabb3366f84ed8 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..3b694b43673a72a381439155c272cce230c8ba06 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..d5650f48360851963f79edefabbf861ff779e1e8 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..88352df71975ebe5055710358bd194f8668b9f2f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..452241b8866ee7af665c9615aae203da442d03d2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..25bf346784c69ccc5e7a4f50ff8840c5fbba8c1a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..2827a8293bb7a436355c66a6d83727a0fae061ae Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_loss.png 
differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..e1f48ff4359341a6e08b035ff2fbdf63fb2b0386 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a744a4b5741bf51dd57872cf6333c5cbaa5a1bb4 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..791c088fec385c6e21a97ed6d186d90a671fcb5d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..1598f7f64797a47375f3a84b97b519a99f437510 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/logging.jsonl b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36fcd519efeeb08b1edd43b2731af723450b9c6a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/logging.jsonl @@ -0,0 +1,66 @@ +{"loss": 2.18066406, "grad_norm": 4.63904407, "learning_rate": 7.69e-06, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.068257, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -127.0, "logps/chosen": -482.0, "logits/rejected": -1.7265625, "logits/chosen": -1.6328125, "nll_loss": 0.4765625, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "10s", "remaining_time": "42m 17s"} +{"loss": 2.07641602, "grad_norm": 4.25736981, "learning_rate": 3.846e-05, "memory(GiB)": 19.96, "train_speed(iter/s)": 0.112369, "rewards/chosen": 0.02502441, "rewards/rejected": 0.0, "rewards/accuracies": 0.25, "rewards/margins": 0.02502441, "logps/rejected": -208.0, "logps/chosen": -552.0, "logits/rejected": -1.375, "logits/chosen": -1.1875, "nll_loss": 0.87109375, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "40s", "remaining_time": "32m 41s"} +{"loss": 1.82529297, "grad_norm": 3.59636428, "learning_rate": 7.692e-05, "memory(GiB)": 25.78, "train_speed(iter/s)": 0.111366, "rewards/chosen": 9.775e-05, "rewards/rejected": 0.06005859, "rewards/accuracies": 0.40000001, "rewards/margins": -0.05908203, "logps/rejected": 
-474.0, "logps/chosen": -494.0, "logits/rejected": -1.7734375, "logits/chosen": -1.640625, "nll_loss": 1.2109375, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "1m 25s", "remaining_time": "34m 8s"} +{"loss": 1.52949219, "grad_norm": 3.37538073, "learning_rate": 9.998e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.106324, "rewards/chosen": 1.9453125, "rewards/rejected": 0.359375, "rewards/accuracies": 1.0, "rewards/margins": 1.5859375, "logps/rejected": -900.0, "logps/chosen": -432.0, "logits/rejected": -1.359375, "logits/chosen": -1.5390625, "nll_loss": 1.1015625, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "2m 16s", "remaining_time": "35m 40s"} +{"loss": 1.01665039, "grad_norm": 0.67307699, "learning_rate": 9.978e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.108726, "rewards/chosen": 5.4375, "rewards/rejected": 4.71875, "rewards/accuracies": 0.80000001, "rewards/margins": 0.7265625, "logps/rejected": -266.0, "logps/chosen": -312.0, "logits/rejected": -1.640625, "logits/chosen": -1.8671875, "nll_loss": 0.48242188, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "2m 59s", "remaining_time": "34m 24s"} +{"eval_loss": 0.54589844, "eval_runtime": 3.7447, "eval_samples_per_second": 1.068, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.25, "eval_rewards/rejected": 5.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 2.5625, "eval_logps/rejected": -151.0, "eval_logps/chosen": -11.375, "eval_logits/rejected": -1.65625, "eval_logits/chosen": -1.5546875, "eval_nll_loss": 0.49414062, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "3m 3s", "remaining_time": "35m 7s"} +{"loss": 0.61274414, "grad_norm": 0.4362879, "learning_rate": 9.937e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.10794, "rewards/chosen": 9.125, "rewards/rejected": 3.84375, "rewards/accuracies": 1.0, 
"rewards/margins": 5.25, "logps/rejected": -330.0, "logps/chosen": -255.0, "logits/rejected": -1.421875, "logits/chosen": -1.578125, "nll_loss": 0.39648438, "epoch": 0.5, "global_step/max_steps": "25/250", "percentage": "10.00%", "elapsed_time": "3m 47s", "remaining_time": "34m 4s"} +{"loss": 0.53232422, "grad_norm": 0.26286257, "learning_rate": 9.874e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.111231, "rewards/chosen": 8.75, "rewards/rejected": 1.625, "rewards/accuracies": 1.0, "rewards/margins": 7.15625, "logps/rejected": -568.0, "logps/chosen": -316.0, "logits/rejected": -1.9375, "logits/chosen": -1.9140625, "nll_loss": 0.59765625, "epoch": 0.6, "global_step/max_steps": "30/250", "percentage": "12.00%", "elapsed_time": "4m 25s", "remaining_time": "32m 25s"} +{"loss": 0.53790894, "grad_norm": 0.5291958, "learning_rate": 9.789e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.114244, "rewards/chosen": 11.25, "rewards/rejected": 0.41015625, "rewards/accuracies": 1.0, "rewards/margins": 10.8125, "logps/rejected": -234.0, "logps/chosen": -442.0, "logits/rejected": -1.9609375, "logits/chosen": -1.7890625, "nll_loss": 0.71484375, "epoch": 0.7, "global_step/max_steps": "35/250", "percentage": "14.00%", "elapsed_time": "5m 1s", "remaining_time": "30m 54s"} +{"loss": 0.4543335, "grad_norm": 0.71058506, "learning_rate": 9.683e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.112752, "rewards/chosen": 9.625, "rewards/rejected": 1.015625, "rewards/accuracies": 1.0, "rewards/margins": 8.625, "logps/rejected": -320.0, "logps/chosen": -171.0, "logits/rejected": -1.859375, "logits/chosen": -2.046875, "nll_loss": 0.5078125, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "5m 50s", "remaining_time": "30m 39s"} +{"eval_loss": 0.44726562, "eval_runtime": 3.7694, "eval_samples_per_second": 1.061, "eval_steps_per_second": 0.265, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": 1.203125, "eval_rewards/accuracies": 1.0, 
"eval_rewards/margins": 7.4375, "eval_logps/rejected": -196.0, "eval_logps/chosen": -8.0625, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -2.03125, "eval_nll_loss": 0.3515625, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "5m 54s", "remaining_time": "30m 58s"} +{"loss": 0.44189453, "grad_norm": 0.3787419, "learning_rate": 9.557e-05, "memory(GiB)": 49.44, "train_speed(iter/s)": 0.111028, "rewards/chosen": 15.875, "rewards/rejected": 0.8828125, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -418.0, "logps/chosen": -382.0, "logits/rejected": -2.0625, "logits/chosen": -1.9375, "nll_loss": 0.44921875, "epoch": 0.9, "global_step/max_steps": "45/250", "percentage": "18.00%", "elapsed_time": "6m 40s", "remaining_time": "30m 26s"} +{"loss": 0.43449707, "grad_norm": 2.69862379, "learning_rate": 9.411e-05, "memory(GiB)": 50.89, "train_speed(iter/s)": 0.112328, "rewards/chosen": 11.25, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 9.1875, "logps/rejected": -936.0, "logps/chosen": -384.0, "logits/rejected": -1.515625, "logits/chosen": -1.7109375, "nll_loss": 0.390625, "epoch": 1.0, "global_step/max_steps": "50/250", "percentage": "20.00%", "elapsed_time": "7m 20s", "remaining_time": "29m 22s"} +{"loss": 0.47237549, "grad_norm": 0.19932517, "learning_rate": 9.245e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.112694, "rewards/chosen": 14.25, "rewards/rejected": 1.5703125, "rewards/accuracies": 1.0, "rewards/margins": 12.6875, "logps/rejected": -592.0, "logps/chosen": -318.0, "logits/rejected": -1.953125, "logits/chosen": -1.890625, "nll_loss": 0.48828125, "epoch": 1.1, "global_step/max_steps": "55/250", "percentage": "22.00%", "elapsed_time": "8m 3s", "remaining_time": "28m 34s"} +{"loss": 0.42766113, "grad_norm": 0.16754376, "learning_rate": 9.061e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113658, "rewards/chosen": 12.8125, "rewards/rejected": 0.05004883, 
"rewards/accuracies": 1.0, "rewards/margins": 12.75, "logps/rejected": -490.0, "logps/chosen": -180.0, "logits/rejected": -1.71875, "logits/chosen": -1.9140625, "nll_loss": 0.32226562, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "8m 43s", "remaining_time": "27m 37s"} +{"eval_loss": 0.43310547, "eval_runtime": 3.7111, "eval_samples_per_second": 1.078, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.25, "eval_logps/rejected": -214.0, "eval_logps/chosen": -7.34375, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.953125, "eval_nll_loss": 0.31835938, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "8m 47s", "remaining_time": "27m 49s"} +{"loss": 0.471875, "grad_norm": 0.26807399, "learning_rate": 8.858e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11268, "rewards/chosen": 11.5625, "rewards/rejected": 0.890625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -664.0, "logps/chosen": -258.0, "logits/rejected": -1.6015625, "logits/chosen": -1.7734375, "nll_loss": 0.48242188, "epoch": 1.3, "global_step/max_steps": "65/250", "percentage": "26.00%", "elapsed_time": "9m 32s", "remaining_time": "27m 9s"} +{"loss": 0.41394043, "grad_norm": 0.23170655, "learning_rate": 8.639e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113965, "rewards/chosen": 9.25, "rewards/rejected": 2.15625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -844.0, "logps/chosen": -51.5, "logits/rejected": -1.578125, "logits/chosen": -2.078125, "nll_loss": 0.515625, "epoch": 1.4, "global_step/max_steps": "70/250", "percentage": "28.00%", "elapsed_time": "10m 9s", "remaining_time": "26m 7s"} +{"loss": 0.41784668, "grad_norm": 0.26967258, "learning_rate": 8.404e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114333, "rewards/chosen": 14.1875, 
"rewards/rejected": 0.3671875, "rewards/accuracies": 1.0, "rewards/margins": 13.8125, "logps/rejected": -364.0, "logps/chosen": -370.0, "logits/rejected": -1.9375, "logits/chosen": -1.796875, "nll_loss": 0.37890625, "epoch": 1.5, "global_step/max_steps": "75/250", "percentage": "30.00%", "elapsed_time": "10m 51s", "remaining_time": "25m 20s"} +{"loss": 0.4442627, "grad_norm": 0.18507453, "learning_rate": 8.154e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.115538, "rewards/chosen": 16.75, "rewards/rejected": 1.53125, "rewards/accuracies": 1.0, "rewards/margins": 15.25, "logps/rejected": -128.0, "logps/chosen": -446.0, "logits/rejected": -1.9375, "logits/chosen": -1.6875, "nll_loss": 0.53515625, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "11m 27s", "remaining_time": "24m 21s"} +{"eval_loss": 0.42626953, "eval_runtime": 3.7179, "eval_samples_per_second": 1.076, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -228.0, "eval_logps/chosen": -6.90625, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8203125, "eval_nll_loss": 0.30078125, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "11m 31s", "remaining_time": "24m 29s"} +{"loss": 0.56564941, "grad_norm": 0.46599134, "learning_rate": 7.89e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114522, "rewards/chosen": 12.1875, "rewards/rejected": -1.3671875, "rewards/accuracies": 1.0, "rewards/margins": 13.5625, "logps/rejected": -636.0, "logps/chosen": -236.0, "logits/rejected": -1.78125, "logits/chosen": -1.84375, "nll_loss": 0.39453125, "epoch": 1.7, "global_step/max_steps": "85/250", "percentage": "34.00%", "elapsed_time": "12m 17s", "remaining_time": "23m 52s"} +{"loss": 0.41357422, "grad_norm": 0.32574932, "learning_rate": 7.614e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 
0.113825, "rewards/chosen": 13.125, "rewards/rejected": -0.29101562, "rewards/accuracies": 1.0, "rewards/margins": 13.4375, "logps/rejected": -434.0, "logps/chosen": -236.0, "logits/rejected": -1.921875, "logits/chosen": -1.765625, "nll_loss": 0.43945312, "epoch": 1.8, "global_step/max_steps": "90/250", "percentage": "36.00%", "elapsed_time": "13m 6s", "remaining_time": "23m 17s"} +{"loss": 0.45847168, "grad_norm": 0.28649227, "learning_rate": 7.326e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113891, "rewards/chosen": 15.0625, "rewards/rejected": 0.921875, "rewards/accuracies": 1.0, "rewards/margins": 14.1875, "logps/rejected": -456.0, "logps/chosen": -286.0, "logits/rejected": -1.578125, "logits/chosen": -1.8359375, "nll_loss": 0.37890625, "epoch": 1.9, "global_step/max_steps": "95/250", "percentage": "38.00%", "elapsed_time": "13m 49s", "remaining_time": "22m 33s"} +{"loss": 0.40567017, "grad_norm": 0.2579591, "learning_rate": 7.028e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11369, "rewards/chosen": 12.0625, "rewards/rejected": 0.62109375, "rewards/accuracies": 1.0, "rewards/margins": 11.4375, "logps/rejected": -292.0, "logps/chosen": -221.0, "logits/rejected": -1.6171875, "logits/chosen": -1.828125, "nll_loss": 0.359375, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "14m 35s", "remaining_time": "21m 52s"} +{"eval_loss": 0.42285156, "eval_runtime": 3.7471, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.875, "eval_logps/rejected": -220.0, "eval_logps/chosen": -6.875, "eval_logits/rejected": -1.5703125, "eval_logits/chosen": -1.859375, "eval_nll_loss": 0.29882812, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "14m 38s", "remaining_time": "21m 58s"} +{"loss": 0.42579346, "grad_norm": 0.28059514, "learning_rate": 6.72e-05, 
"memory(GiB)": 61.51, "train_speed(iter/s)": 0.112746, "rewards/chosen": 15.5, "rewards/rejected": 1.28125, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -556.0, "logps/chosen": -360.0, "logits/rejected": -2.0, "logits/chosen": -1.9453125, "nll_loss": 0.5390625, "epoch": 2.1, "global_step/max_steps": "105/250", "percentage": "42.00%", "elapsed_time": "15m 26s", "remaining_time": "21m 19s"} +{"loss": 0.41505127, "grad_norm": 0.33433345, "learning_rate": 6.406e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.1128, "rewards/chosen": 13.5, "rewards/rejected": 1.640625, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/rejected": -836.0, "logps/chosen": -330.0, "logits/rejected": -1.4765625, "logits/chosen": -1.5390625, "nll_loss": 0.3671875, "epoch": 2.2, "global_step/max_steps": "110/250", "percentage": "44.00%", "elapsed_time": "16m 10s", "remaining_time": "20m 35s"} +{"loss": 0.44321899, "grad_norm": 0.17394461, "learning_rate": 6.085e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113012, "rewards/chosen": 14.8125, "rewards/rejected": 1.484375, "rewards/accuracies": 1.0, "rewards/margins": 13.3125, "logps/rejected": -588.0, "logps/chosen": -424.0, "logits/rejected": -1.7578125, "logits/chosen": -1.703125, "nll_loss": 0.41015625, "epoch": 2.3, "global_step/max_steps": "115/250", "percentage": "46.00%", "elapsed_time": "16m 53s", "remaining_time": "19m 49s"} +{"loss": 0.36558533, "grad_norm": 0.30679423, "learning_rate": 5.759e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113772, "rewards/chosen": 13.125, "rewards/rejected": 0.49023438, "rewards/accuracies": 1.0, "rewards/margins": 12.625, "logps/rejected": -292.0, "logps/chosen": -310.0, "logits/rejected": -1.84375, "logits/chosen": -1.8046875, "nll_loss": 0.31445312, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "17m 30s", "remaining_time": "18m 57s"} +{"eval_loss": 0.41748047, "eval_runtime": 3.747, 
"eval_samples_per_second": 1.068, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.75, "eval_rewards/rejected": -1.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.5625, "eval_logps/rejected": -226.0, "eval_logps/chosen": -6.53125, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8125, "eval_nll_loss": 0.28515625, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "17m 34s", "remaining_time": "19m 1s"} +{"loss": 0.39929199, "grad_norm": 0.32014866, "learning_rate": 5.43e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113708, "rewards/chosen": 10.5, "rewards/rejected": -3.796875, "rewards/accuracies": 1.0, "rewards/margins": 14.3125, "logps/rejected": -944.0, "logps/chosen": -38.25, "logits/rejected": -1.7109375, "logits/chosen": -2.15625, "nll_loss": 0.4609375, "epoch": 2.5, "global_step/max_steps": "125/250", "percentage": "50.00%", "elapsed_time": "18m 14s", "remaining_time": "18m 14s"} +{"loss": 0.38937073, "grad_norm": 0.43658369, "learning_rate": 5.099e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114337, "rewards/chosen": 17.25, "rewards/rejected": 0.51953125, "rewards/accuracies": 1.0, "rewards/margins": 16.75, "logps/rejected": -249.0, "logps/chosen": -324.0, "logits/rejected": -1.9140625, "logits/chosen": -1.8671875, "nll_loss": 0.41210938, "epoch": 2.6, "global_step/max_steps": "130/250", "percentage": "52.00%", "elapsed_time": "18m 52s", "remaining_time": "17m 25s"} +{"loss": 0.35826416, "grad_norm": 0.26221499, "learning_rate": 4.768e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114575, "rewards/chosen": 11.0625, "rewards/rejected": -1.078125, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -524.0, "logps/chosen": -125.0, "logits/rejected": -1.765625, "logits/chosen": -1.8671875, "nll_loss": 0.19140625, "epoch": 2.7, "global_step/max_steps": "135/250", "percentage": "54.00%", "elapsed_time": "19m 33s", "remaining_time": "16m 39s"} 
+{"loss": 0.39818535, "grad_norm": 0.21875234, "learning_rate": 4.438e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.115122, "rewards/chosen": 14.375, "rewards/rejected": -1.2421875, "rewards/accuracies": 1.0, "rewards/margins": 15.625, "logps/rejected": -604.0, "logps/chosen": -288.0, "logits/rejected": -1.953125, "logits/chosen": -1.9921875, "nll_loss": 0.546875, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "20m 11s", "remaining_time": "15m 52s"} +{"eval_loss": 0.42333984, "eval_runtime": 3.7658, "eval_samples_per_second": 1.062, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.5, "eval_logps/rejected": -226.0, "eval_logps/chosen": -7.1875, "eval_logits/rejected": -1.59375, "eval_logits/chosen": -1.828125, "eval_nll_loss": 0.3125, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "20m 15s", "remaining_time": "15m 54s"} +{"loss": 0.42649708, "grad_norm": 0.56550937, "learning_rate": 4.11e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113855, "rewards/chosen": 15.0625, "rewards/rejected": 0.90234375, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -552.0, "logps/chosen": -478.0, "logits/rejected": -1.8984375, "logits/chosen": -1.765625, "nll_loss": 0.55078125, "epoch": 2.9, "global_step/max_steps": "145/250", "percentage": "58.00%", "elapsed_time": "21m 9s", "remaining_time": "15m 18s"} +{"loss": 0.32166138, "grad_norm": 0.39937568, "learning_rate": 3.786e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11347, "rewards/chosen": 19.5, "rewards/rejected": 1.71875, "rewards/accuracies": 1.0, "rewards/margins": 17.75, "logps/rejected": -251.0, "logps/chosen": -344.0, "logits/rejected": -1.9296875, "logits/chosen": -1.5390625, "nll_loss": 0.32617188, "epoch": 3.0, "global_step/max_steps": "150/250", "percentage": "60.00%", "elapsed_time": "21m 
57s", "remaining_time": "14m 38s"} +{"loss": 0.40140839, "grad_norm": 0.28789169, "learning_rate": 3.468e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113762, "rewards/chosen": 16.375, "rewards/rejected": 0.6796875, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -506.0, "logps/chosen": -446.0, "logits/rejected": -1.78125, "logits/chosen": -1.625, "nll_loss": 0.3828125, "epoch": 3.1, "global_step/max_steps": "155/250", "percentage": "62.00%", "elapsed_time": "22m 38s", "remaining_time": "13m 52s"} +{"loss": 0.36308289, "grad_norm": 0.28174536, "learning_rate": 3.156e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113646, "rewards/chosen": 18.0, "rewards/rejected": 1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 16.625, "logps/rejected": -322.0, "logps/chosen": -378.0, "logits/rejected": -2.171875, "logits/chosen": -1.90625, "nll_loss": 0.359375, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "23m 23s", "remaining_time": "13m 9s"} +{"eval_loss": 0.43774414, "eval_runtime": 3.7498, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.5625, "eval_rewards/rejected": -2.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.625, "eval_logps/rejected": -229.0, "eval_logps/chosen": -8.5625, "eval_logits/rejected": -1.6015625, "eval_logits/chosen": -1.7578125, "eval_nll_loss": 0.37304688, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "23m 27s", "remaining_time": "13m 11s"} +{"loss": 0.33818817, "grad_norm": 1.05770572, "learning_rate": 2.852e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113429, "rewards/chosen": 12.375, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 14.4375, "logps/rejected": -418.0, "logps/chosen": -132.0, "logits/rejected": -1.6953125, "logits/chosen": -1.609375, "nll_loss": 0.13867188, "epoch": 3.3, "global_step/max_steps": "165/250", 
"percentage": "66.00%", "elapsed_time": "24m 10s", "remaining_time": "12m 27s"} +{"loss": 0.34040909, "grad_norm": 0.38519303, "learning_rate": 2.558e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113152, "rewards/chosen": 22.0, "rewards/rejected": 1.3125, "rewards/accuracies": 1.0, "rewards/margins": 20.625, "logps/rejected": -159.0, "logps/chosen": -498.0, "logits/rejected": -1.9296875, "logits/chosen": -1.5390625, "nll_loss": 0.34179688, "epoch": 3.4, "global_step/max_steps": "170/250", "percentage": "68.00%", "elapsed_time": "24m 57s", "remaining_time": "11m 44s"} +{"loss": 0.31077843, "grad_norm": 0.57185616, "learning_rate": 2.274e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113326, "rewards/chosen": 16.125, "rewards/rejected": 1.3359375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -436.0, "logps/chosen": -302.0, "logits/rejected": -1.6640625, "logits/chosen": -1.6015625, "nll_loss": 0.43359375, "epoch": 3.5, "global_step/max_steps": "175/250", "percentage": "70.00%", "elapsed_time": "25m 39s", "remaining_time": "10m 59s"} +{"loss": 0.3955904, "grad_norm": 0.30064476, "learning_rate": 2.002e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113857, "rewards/chosen": 18.375, "rewards/rejected": 0.3203125, "rewards/accuracies": 1.0, "rewards/margins": 18.0, "logps/rejected": -247.0, "logps/chosen": -418.0, "logits/rejected": -2.03125, "logits/chosen": -1.6875, "nll_loss": 0.49414062, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "26m 16s", "remaining_time": "10m 13s"} +{"eval_loss": 0.44775391, "eval_runtime": 3.7431, "eval_samples_per_second": 1.069, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -2.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.5, "eval_logits/rejected": -1.6015625, "eval_logits/chosen": -1.78125, "eval_nll_loss": 0.4140625, "epoch": 3.6, 
"global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "26m 20s", "remaining_time": "10m 14s"} +{"loss": 0.34519873, "grad_norm": 0.25784447, "learning_rate": 1.744e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11379, "rewards/chosen": 17.5, "rewards/rejected": 0.62109375, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -197.0, "logps/chosen": -412.0, "logits/rejected": -1.9140625, "logits/chosen": -1.671875, "nll_loss": 0.46875, "epoch": 3.7, "global_step/max_steps": "185/250", "percentage": "74.00%", "elapsed_time": "27m 1s", "remaining_time": "9m 29s"} +{"loss": 0.33051834, "grad_norm": 0.3269795, "learning_rate": 1.5e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114169, "rewards/chosen": 16.5, "rewards/rejected": -0.5, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -560.0, "logps/chosen": -239.0, "logits/rejected": -1.859375, "logits/chosen": -1.984375, "nll_loss": 0.47851562, "epoch": 3.8, "global_step/max_steps": "190/250", "percentage": "76.00%", "elapsed_time": "27m 39s", "remaining_time": "8m 44s"} +{"loss": 0.28177929, "grad_norm": 0.4692309, "learning_rate": 1.271e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113877, "rewards/chosen": 18.75, "rewards/rejected": -0.9609375, "rewards/accuracies": 1.0, "rewards/margins": 19.625, "logps/rejected": -524.0, "logps/chosen": -346.0, "logits/rejected": -1.7421875, "logits/chosen": -1.5390625, "nll_loss": 0.3359375, "epoch": 3.9, "global_step/max_steps": "195/250", "percentage": "78.00%", "elapsed_time": "28m 27s", "remaining_time": "8m 1s"} +{"loss": 0.3228662, "grad_norm": 0.597351, "learning_rate": 1.059e-05, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113643, "rewards/chosen": 16.25, "rewards/rejected": -0.19921875, "rewards/accuracies": 1.0, "rewards/margins": 16.5, "logps/rejected": -524.0, "logps/chosen": -270.0, "logits/rejected": -1.5234375, "logits/chosen": -1.6015625, "nll_loss": 0.30859375, "epoch": 4.0, 
"global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "29m 15s", "remaining_time": "7m 18s"} +{"eval_loss": 0.44970703, "eval_runtime": 3.6961, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -9.75, "eval_logits/rejected": -1.6015625, "eval_logits/chosen": -1.78125, "eval_nll_loss": 0.42382812, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "29m 19s", "remaining_time": "7m 19s"} +{"loss": 0.35520964, "grad_norm": 0.32154523, "learning_rate": 8.63e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113, "rewards/chosen": 16.25, "rewards/rejected": 0.890625, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -564.0, "logps/chosen": -318.0, "logits/rejected": -1.7734375, "logits/chosen": -1.7890625, "nll_loss": 0.328125, "epoch": 4.1, "global_step/max_steps": "205/250", "percentage": "82.00%", "elapsed_time": "30m 9s", "remaining_time": "6m 37s"} +{"loss": 0.29902601, "grad_norm": 0.40259344, "learning_rate": 6.87e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.11323, "rewards/chosen": 18.5, "rewards/rejected": 0.9375, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -215.0, "logps/chosen": -248.0, "logits/rejected": -2.0, "logits/chosen": -1.7890625, "nll_loss": 0.3046875, "epoch": 4.2, "global_step/max_steps": "210/250", "percentage": "84.00%", "elapsed_time": "30m 50s", "remaining_time": "5m 52s"} +{"loss": 0.31557293, "grad_norm": 0.42590007, "learning_rate": 5.29e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113298, "rewards/chosen": 13.625, "rewards/rejected": -1.359375, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -896.0, "logps/chosen": -143.0, "logits/rejected": -1.5078125, "logits/chosen": -1.8984375, "nll_loss": 
0.19238281, "epoch": 4.3, "global_step/max_steps": "215/250", "percentage": "86.00%", "elapsed_time": "31m 33s", "remaining_time": "5m 8s"} +{"loss": 0.28976035, "grad_norm": 0.22959233, "learning_rate": 3.9e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113169, "rewards/chosen": 11.9375, "rewards/rejected": -2.640625, "rewards/accuracies": 1.0, "rewards/margins": 14.5625, "logps/rejected": -800.0, "logps/chosen": -78.5, "logits/rejected": -1.5, "logits/chosen": -1.796875, "nll_loss": 0.125, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "32m 19s", "remaining_time": "4m 24s"} +{"eval_loss": 0.45288086, "eval_runtime": 3.7921, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.264, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -10.125, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.75, "eval_nll_loss": 0.43945312, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "32m 23s", "remaining_time": "4m 24s"} +{"loss": 0.33482242, "grad_norm": 0.42544041, "learning_rate": 2.72e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112881, "rewards/chosen": 23.875, "rewards/rejected": 0.87890625, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -220.0, "logps/chosen": -524.0, "logits/rejected": -1.9296875, "logits/chosen": -1.75, "nll_loss": 0.5546875, "epoch": 4.5, "global_step/max_steps": "225/250", "percentage": "90.00%", "elapsed_time": "33m 8s", "remaining_time": "3m 40s"} +{"loss": 0.29665933, "grad_norm": 0.59837386, "learning_rate": 1.75e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112845, "rewards/chosen": 17.375, "rewards/rejected": -0.33007812, "rewards/accuracies": 1.0, "rewards/margins": 17.75, "logps/rejected": -684.0, "logps/chosen": -247.0, "logits/rejected": -1.8046875, "logits/chosen": 
-1.71875, "nll_loss": 0.31445312, "epoch": 4.6, "global_step/max_steps": "230/250", "percentage": "92.00%", "elapsed_time": "33m 53s", "remaining_time": "2m 56s"} +{"loss": 0.26414022, "grad_norm": 0.47143059, "learning_rate": 9.9e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112889, "rewards/chosen": 12.375, "rewards/rejected": -3.078125, "rewards/accuracies": 1.0, "rewards/margins": 15.4375, "logps/rejected": -608.0, "logps/chosen": -184.0, "logits/rejected": -1.6796875, "logits/chosen": -1.7265625, "nll_loss": 0.23925781, "epoch": 4.7, "global_step/max_steps": "235/250", "percentage": "94.00%", "elapsed_time": "34m 37s", "remaining_time": "2m 12s"} +{"loss": 0.31013832, "grad_norm": 0.39022287, "learning_rate": 4.4e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.11322, "rewards/chosen": 19.5, "rewards/rejected": 0.55078125, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -608.0, "logps/chosen": -374.0, "logits/rejected": -2.046875, "logits/chosen": -2.046875, "nll_loss": 0.55078125, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "35m 15s", "remaining_time": "1m 28s"} +{"eval_loss": 0.45703125, "eval_runtime": 3.7647, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.875, "eval_logps/rejected": -233.0, "eval_logps/chosen": -10.5, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.7421875, "eval_nll_loss": 0.45507812, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "35m 19s", "remaining_time": "1m 28s"} +{"loss": 0.32958794, "grad_norm": 0.21408465, "learning_rate": 1.1e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113152, "rewards/chosen": 18.0, "rewards/rejected": 0.66796875, "rewards/accuracies": 1.0, "rewards/margins": 17.375, "logps/rejected": -490.0, "logps/chosen": -450.0, "logits/rejected": 
-1.9921875, "logits/chosen": -1.6875, "nll_loss": 0.453125, "epoch": 4.9, "global_step/max_steps": "245/250", "percentage": "98.00%", "elapsed_time": "36m 0s", "remaining_time": "44s"} +{"loss": 0.3642045, "grad_norm": 0.17733369, "learning_rate": 0.0, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113433, "rewards/chosen": 15.875, "rewards/rejected": -1.109375, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -408.0, "logps/chosen": -332.0, "logits/rejected": -1.90625, "logits/chosen": -1.796875, "nll_loss": 0.37695312, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 39s", "remaining_time": "0s"} +{"eval_loss": 0.45654297, "eval_runtime": 3.7077, "eval_samples_per_second": 1.079, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -10.3125, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.7421875, "eval_nll_loss": 0.44921875, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 43s", "remaining_time": "0s"} +{"train_runtime": 2205.788, "train_samples_per_second": 0.895, "train_steps_per_second": 0.113, "total_flos": 725771410997248.0, "train_loss": 0.49011184, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 45s", "remaining_time": "0s"} +{"train_dataset": "1172.215190±496.010190, min=300.000000, max=4173.000000, size=395", "val_dataset": "1183.750000±508.140421, min=717.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 32830.9852M Params (67.1089M Trainable [0.2044%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-250", "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/checkpoint-120", "best_metric": 0.41748047, "global_step": 250, "log_history": [{"loss": 2.1806640625, "grad_norm": 4.6390440726440465, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.068257, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -127.0, "logps/chosen": -482.0, "logits/rejected": -1.7265625, "logits/chosen": -1.6328125, "nll_loss": 0.4765625, "epoch": 0.02, "step": 1}, {"loss": 2.076416015625, "grad_norm": 4.257369809058118, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 19.96, "train_speed(iter/s)": 0.112369, "rewards/chosen": 0.0250244140625, "rewards/rejected": 0.0, "rewards/accuracies": 0.25, "rewards/margins": 0.0250244140625, "logps/rejected": -208.0, "logps/chosen": -552.0, "logits/rejected": -1.375, "logits/chosen": -1.1875, "nll_loss": 0.87109375, "epoch": 0.1, "step": 5}, {"loss": 1.82529296875, "grad_norm": 3.5963642842176267, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 25.78, "train_speed(iter/s)": 0.111366, "rewards/chosen": 9.775161743164062e-05, "rewards/rejected": 0.06005859375, "rewards/accuracies": 0.4000000059604645, "rewards/margins": -0.05908203125, "logps/rejected": -474.0, "logps/chosen": -494.0, "logits/rejected": -1.7734375, "logits/chosen": -1.640625, "nll_loss": 1.2109375, "epoch": 0.2, "step": 10}, {"loss": 1.5294921875, "grad_norm": 3.3753807300891543, "learning_rate": 9.998242976313776e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.106324, "rewards/chosen": 1.9453125, "rewards/rejected": 0.359375, "rewards/accuracies": 1.0, "rewards/margins": 1.5859375, "logps/rejected": -900.0, "logps/chosen": -432.0, "logits/rejected": -1.359375, "logits/chosen": -1.5390625, "nll_loss": 1.1015625, "epoch": 0.3, "step": 15}, {"loss": 1.016650390625, "grad_norm": 0.6730769877578879, 
"learning_rate": 9.97849063861667e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.108726, "rewards/chosen": 5.4375, "rewards/rejected": 4.71875, "rewards/accuracies": 0.800000011920929, "rewards/margins": 0.7265625, "logps/rejected": -266.0, "logps/chosen": -312.0, "logits/rejected": -1.640625, "logits/chosen": -1.8671875, "nll_loss": 0.482421875, "epoch": 0.4, "step": 20}, {"eval_loss": 0.5458984375, "eval_runtime": 3.7447, "eval_samples_per_second": 1.068, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.25, "eval_rewards/rejected": 5.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 2.5625, "eval_logps/rejected": -151.0, "eval_logps/chosen": -11.375, "eval_logits/rejected": -1.65625, "eval_logits/chosen": -1.5546875, "eval_nll_loss": 0.494140625, "epoch": 0.4, "step": 20}, {"loss": 0.612744140625, "grad_norm": 0.4362878986584245, "learning_rate": 9.936876709681668e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.10794, "rewards/chosen": 9.125, "rewards/rejected": 3.84375, "rewards/accuracies": 1.0, "rewards/margins": 5.25, "logps/rejected": -330.0, "logps/chosen": -255.0, "logits/rejected": -1.421875, "logits/chosen": -1.578125, "nll_loss": 0.396484375, "epoch": 0.5, "step": 25}, {"loss": 0.53232421875, "grad_norm": 0.262862568645726, "learning_rate": 9.873583924954152e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.111231, "rewards/chosen": 8.75, "rewards/rejected": 1.625, "rewards/accuracies": 1.0, "rewards/margins": 7.15625, "logps/rejected": -568.0, "logps/chosen": -316.0, "logits/rejected": -1.9375, "logits/chosen": -1.9140625, "nll_loss": 0.59765625, "epoch": 0.6, "step": 30}, {"loss": 0.537908935546875, "grad_norm": 0.5291957986954565, "learning_rate": 9.788890216258939e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.114244, "rewards/chosen": 11.25, "rewards/rejected": 0.41015625, "rewards/accuracies": 1.0, "rewards/margins": 10.8125, "logps/rejected": -234.0, "logps/chosen": -442.0, "logits/rejected": -1.9609375, 
"logits/chosen": -1.7890625, "nll_loss": 0.71484375, "epoch": 0.7, "step": 35}, {"loss": 0.45433349609375, "grad_norm": 0.7105850587888475, "learning_rate": 9.68316749134364e-05, "memory(GiB)": 45.61, "train_speed(iter/s)": 0.112752, "rewards/chosen": 9.625, "rewards/rejected": 1.015625, "rewards/accuracies": 1.0, "rewards/margins": 8.625, "logps/rejected": -320.0, "logps/chosen": -171.0, "logits/rejected": -1.859375, "logits/chosen": -2.046875, "nll_loss": 0.5078125, "epoch": 0.8, "step": 40}, {"eval_loss": 0.447265625, "eval_runtime": 3.7694, "eval_samples_per_second": 1.061, "eval_steps_per_second": 0.265, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": 1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.4375, "eval_logps/rejected": -196.0, "eval_logps/chosen": -8.0625, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -2.03125, "eval_nll_loss": 0.3515625, "epoch": 0.8, "step": 40}, {"loss": 0.44189453125, "grad_norm": 0.37874190112712086, "learning_rate": 9.55688000075414e-05, "memory(GiB)": 49.44, "train_speed(iter/s)": 0.111028, "rewards/chosen": 15.875, "rewards/rejected": 0.8828125, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -418.0, "logps/chosen": -382.0, "logits/rejected": -2.0625, "logits/chosen": -1.9375, "nll_loss": 0.44921875, "epoch": 0.9, "step": 45}, {"loss": 0.4344970703125, "grad_norm": 2.6986237924570067, "learning_rate": 9.410582299213573e-05, "memory(GiB)": 50.89, "train_speed(iter/s)": 0.112328, "rewards/chosen": 11.25, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 9.1875, "logps/rejected": -936.0, "logps/chosen": -384.0, "logits/rejected": -1.515625, "logits/chosen": -1.7109375, "nll_loss": 0.390625, "epoch": 1.0, "step": 50}, {"loss": 0.47237548828125, "grad_norm": 0.19932517409606404, "learning_rate": 9.244916810456821e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.112694, "rewards/chosen": 14.25, "rewards/rejected": 1.5703125, 
"rewards/accuracies": 1.0, "rewards/margins": 12.6875, "logps/rejected": -592.0, "logps/chosen": -318.0, "logits/rejected": -1.953125, "logits/chosen": -1.890625, "nll_loss": 0.48828125, "epoch": 1.1, "step": 55}, {"loss": 0.4276611328125, "grad_norm": 0.16754375656348677, "learning_rate": 9.060611006213832e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113658, "rewards/chosen": 12.8125, "rewards/rejected": 0.050048828125, "rewards/accuracies": 1.0, "rewards/margins": 12.75, "logps/rejected": -490.0, "logps/chosen": -180.0, "logits/rejected": -1.71875, "logits/chosen": -1.9140625, "nll_loss": 0.322265625, "epoch": 1.2, "step": 60}, {"eval_loss": 0.43310546875, "eval_runtime": 3.7111, "eval_samples_per_second": 1.078, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.25, "eval_logps/rejected": -214.0, "eval_logps/chosen": -7.34375, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.953125, "eval_nll_loss": 0.318359375, "epoch": 1.2, "step": 60}, {"loss": 0.471875, "grad_norm": 0.26807398908522023, "learning_rate": 8.858474211729469e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11268, "rewards/chosen": 11.5625, "rewards/rejected": 0.890625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -664.0, "logps/chosen": -258.0, "logits/rejected": -1.6015625, "logits/chosen": -1.7734375, "nll_loss": 0.482421875, "epoch": 1.3, "step": 65}, {"loss": 0.4139404296875, "grad_norm": 0.23170654825310766, "learning_rate": 8.639394051847472e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113965, "rewards/chosen": 9.25, "rewards/rejected": 2.15625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -844.0, "logps/chosen": -51.5, "logits/rejected": -1.578125, "logits/chosen": -2.078125, "nll_loss": 0.515625, "epoch": 1.4, "step": 70}, {"loss": 0.4178466796875, "grad_norm": 0.26967258360728136, "learning_rate": 
8.404332553264547e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114333, "rewards/chosen": 14.1875, "rewards/rejected": 0.3671875, "rewards/accuracies": 1.0, "rewards/margins": 13.8125, "logps/rejected": -364.0, "logps/chosen": -370.0, "logits/rejected": -1.9375, "logits/chosen": -1.796875, "nll_loss": 0.37890625, "epoch": 1.5, "step": 75}, {"loss": 0.4442626953125, "grad_norm": 0.18507453378934124, "learning_rate": 8.154321920070414e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.115538, "rewards/chosen": 16.75, "rewards/rejected": 1.53125, "rewards/accuracies": 1.0, "rewards/margins": 15.25, "logps/rejected": -128.0, "logps/chosen": -446.0, "logits/rejected": -1.9375, "logits/chosen": -1.6875, "nll_loss": 0.53515625, "epoch": 1.6, "step": 80}, {"eval_loss": 0.42626953125, "eval_runtime": 3.7179, "eval_samples_per_second": 1.076, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -228.0, "eval_logps/chosen": -6.90625, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8203125, "eval_nll_loss": 0.30078125, "epoch": 1.6, "step": 80}, {"loss": 0.5656494140625, "grad_norm": 0.4659913406853354, "learning_rate": 7.890460001124242e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114522, "rewards/chosen": 12.1875, "rewards/rejected": -1.3671875, "rewards/accuracies": 1.0, "rewards/margins": 13.5625, "logps/rejected": -636.0, "logps/chosen": -236.0, "logits/rejected": -1.78125, "logits/chosen": -1.84375, "nll_loss": 0.39453125, "epoch": 1.7, "step": 85}, {"loss": 0.41357421875, "grad_norm": 0.3257493171354355, "learning_rate": 7.613905469171246e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113825, "rewards/chosen": 13.125, "rewards/rejected": -0.291015625, "rewards/accuracies": 1.0, "rewards/margins": 13.4375, "logps/rejected": -434.0, "logps/chosen": -236.0, "logits/rejected": -1.921875, "logits/chosen": -1.765625, 
"nll_loss": 0.439453125, "epoch": 1.8, "step": 90}, {"loss": 0.4584716796875, "grad_norm": 0.2864922683958331, "learning_rate": 7.325872732868869e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113891, "rewards/chosen": 15.0625, "rewards/rejected": 0.921875, "rewards/accuracies": 1.0, "rewards/margins": 14.1875, "logps/rejected": -456.0, "logps/chosen": -286.0, "logits/rejected": -1.578125, "logits/chosen": -1.8359375, "nll_loss": 0.37890625, "epoch": 1.9, "step": 95}, {"loss": 0.405670166015625, "grad_norm": 0.2579590962972889, "learning_rate": 7.027626604064969e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11369, "rewards/chosen": 12.0625, "rewards/rejected": 0.62109375, "rewards/accuracies": 1.0, "rewards/margins": 11.4375, "logps/rejected": -292.0, "logps/chosen": -221.0, "logits/rejected": -1.6171875, "logits/chosen": -1.828125, "nll_loss": 0.359375, "epoch": 2.0, "step": 100}, {"eval_loss": 0.4228515625, "eval_runtime": 3.7471, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.875, "eval_logps/rejected": -220.0, "eval_logps/chosen": -6.875, "eval_logits/rejected": -1.5703125, "eval_logits/chosen": -1.859375, "eval_nll_loss": 0.298828125, "epoch": 2.0, "step": 100}, {"loss": 0.42579345703125, "grad_norm": 0.280595138331498, "learning_rate": 6.720476743745072e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.112746, "rewards/chosen": 15.5, "rewards/rejected": 1.28125, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -556.0, "logps/chosen": -360.0, "logits/rejected": -2.0, "logits/chosen": -1.9453125, "nll_loss": 0.5390625, "epoch": 2.1, "step": 105}, {"loss": 0.41505126953125, "grad_norm": 0.33433345487081023, "learning_rate": 6.405771911037699e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.1128, "rewards/chosen": 13.5, "rewards/rejected": 1.640625, "rewards/accuracies": 1.0, 
"rewards/margins": 11.875, "logps/rejected": -836.0, "logps/chosen": -330.0, "logits/rejected": -1.4765625, "logits/chosen": -1.5390625, "nll_loss": 0.3671875, "epoch": 2.2, "step": 110}, {"loss": 0.443218994140625, "grad_norm": 0.173944606658398, "learning_rate": 6.08489404053159e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113012, "rewards/chosen": 14.8125, "rewards/rejected": 1.484375, "rewards/accuracies": 1.0, "rewards/margins": 13.3125, "logps/rejected": -588.0, "logps/chosen": -424.0, "logits/rejected": -1.7578125, "logits/chosen": -1.703125, "nll_loss": 0.41015625, "epoch": 2.3, "step": 115}, {"loss": 0.3655853271484375, "grad_norm": 0.30679422743634893, "learning_rate": 5.7592521739125726e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113772, "rewards/chosen": 13.125, "rewards/rejected": 0.490234375, "rewards/accuracies": 1.0, "rewards/margins": 12.625, "logps/rejected": -292.0, "logps/chosen": -310.0, "logits/rejected": -1.84375, "logits/chosen": -1.8046875, "nll_loss": 0.314453125, "epoch": 2.4, "step": 120}, {"eval_loss": 0.41748046875, "eval_runtime": 3.747, "eval_samples_per_second": 1.068, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.75, "eval_rewards/rejected": -1.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.5625, "eval_logps/rejected": -226.0, "eval_logps/chosen": -6.53125, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8125, "eval_nll_loss": 0.28515625, "epoch": 2.4, "step": 120}, {"loss": 0.3992919921875, "grad_norm": 0.320148655196901, "learning_rate": 5.430276272567485e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113708, "rewards/chosen": 10.5, "rewards/rejected": -3.796875, "rewards/accuracies": 1.0, "rewards/margins": 14.3125, "logps/rejected": -944.0, "logps/chosen": -38.25, "logits/rejected": -1.7109375, "logits/chosen": -2.15625, "nll_loss": 0.4609375, "epoch": 2.5, "step": 125}, {"loss": 0.3893707275390625, "grad_norm": 0.4365836857173622, "learning_rate": 
5.0994109383253506e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114337, "rewards/chosen": 17.25, "rewards/rejected": 0.51953125, "rewards/accuracies": 1.0, "rewards/margins": 16.75, "logps/rejected": -249.0, "logps/chosen": -324.0, "logits/rejected": -1.9140625, "logits/chosen": -1.8671875, "nll_loss": 0.412109375, "epoch": 2.6, "step": 130}, {"loss": 0.35826416015625, "grad_norm": 0.26221498737417476, "learning_rate": 4.768109069909307e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114575, "rewards/chosen": 11.0625, "rewards/rejected": -1.078125, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -524.0, "logps/chosen": -125.0, "logits/rejected": -1.765625, "logits/chosen": -1.8671875, "nll_loss": 0.19140625, "epoch": 2.7, "step": 135}, {"loss": 0.39818534851074217, "grad_norm": 0.21875233894682816, "learning_rate": 4.4378254829551396e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.115122, "rewards/chosen": 14.375, "rewards/rejected": -1.2421875, "rewards/accuracies": 1.0, "rewards/margins": 15.625, "logps/rejected": -604.0, "logps/chosen": -288.0, "logits/rejected": -1.953125, "logits/chosen": -1.9921875, "nll_loss": 0.546875, "epoch": 2.8, "step": 140}, {"eval_loss": 0.42333984375, "eval_runtime": 3.7658, "eval_samples_per_second": 1.062, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.5, "eval_logps/rejected": -226.0, "eval_logps/chosen": -7.1875, "eval_logits/rejected": -1.59375, "eval_logits/chosen": -1.828125, "eval_nll_loss": 0.3125, "epoch": 2.8, "step": 140}, {"loss": 0.4264970779418945, "grad_norm": 0.5655093745020108, "learning_rate": 4.11001052161225e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113855, "rewards/chosen": 15.0625, "rewards/rejected": 0.90234375, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -552.0, "logps/chosen": -478.0, "logits/rejected": -1.8984375, 
"logits/chosen": -1.765625, "nll_loss": 0.55078125, "epoch": 2.9, "step": 145}, {"loss": 0.321661376953125, "grad_norm": 0.39937567960730846, "learning_rate": 3.786103689779861e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11347, "rewards/chosen": 19.5, "rewards/rejected": 1.71875, "rewards/accuracies": 1.0, "rewards/margins": 17.75, "logps/rejected": -251.0, "logps/chosen": -344.0, "logits/rejected": -1.9296875, "logits/chosen": -1.5390625, "nll_loss": 0.326171875, "epoch": 3.0, "step": 150}, {"loss": 0.40140838623046876, "grad_norm": 0.28789168702712753, "learning_rate": 3.467527329945026e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113762, "rewards/chosen": 16.375, "rewards/rejected": 0.6796875, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -506.0, "logps/chosen": -446.0, "logits/rejected": -1.78125, "logits/chosen": -1.625, "nll_loss": 0.3828125, "epoch": 3.1, "step": 155}, {"loss": 0.3630828857421875, "grad_norm": 0.28174536188893384, "learning_rate": 3.1556803773799614e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113646, "rewards/chosen": 18.0, "rewards/rejected": 1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 16.625, "logps/rejected": -322.0, "logps/chosen": -378.0, "logits/rejected": -2.171875, "logits/chosen": -1.90625, "nll_loss": 0.359375, "epoch": 3.2, "step": 160}, {"eval_loss": 0.437744140625, "eval_runtime": 3.7498, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.5625, "eval_rewards/rejected": -2.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.625, "eval_logps/rejected": -229.0, "eval_logps/chosen": -8.5625, "eval_logits/rejected": -1.6015625, "eval_logits/chosen": -1.7578125, "eval_nll_loss": 0.373046875, "epoch": 3.2, "step": 160}, {"loss": 0.33818817138671875, "grad_norm": 1.0577057175486542, "learning_rate": 2.8519322171253602e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113429, "rewards/chosen": 12.375, "rewards/rejected": 
-2.078125, "rewards/accuracies": 1.0, "rewards/margins": 14.4375, "logps/rejected": -418.0, "logps/chosen": -132.0, "logits/rejected": -1.6953125, "logits/chosen": -1.609375, "nll_loss": 0.138671875, "epoch": 3.3, "step": 165}, {"loss": 0.3404090881347656, "grad_norm": 0.38519303074023925, "learning_rate": 2.5576166707349385e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113152, "rewards/chosen": 22.0, "rewards/rejected": 1.3125, "rewards/accuracies": 1.0, "rewards/margins": 20.625, "logps/rejected": -159.0, "logps/chosen": -498.0, "logits/rejected": -1.9296875, "logits/chosen": -1.5390625, "nll_loss": 0.341796875, "epoch": 3.4, "step": 170}, {"loss": 0.3107784271240234, "grad_norm": 0.5718561646417449, "learning_rate": 2.2740261391866637e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113326, "rewards/chosen": 16.125, "rewards/rejected": 1.3359375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -436.0, "logps/chosen": -302.0, "logits/rejected": -1.6640625, "logits/chosen": -1.6015625, "nll_loss": 0.43359375, "epoch": 3.5, "step": 175}, {"loss": 0.39559040069580076, "grad_norm": 0.30064476351594455, "learning_rate": 2.002405927680374e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113857, "rewards/chosen": 18.375, "rewards/rejected": 0.3203125, "rewards/accuracies": 1.0, "rewards/margins": 18.0, "logps/rejected": -247.0, "logps/chosen": -418.0, "logits/rejected": -2.03125, "logits/chosen": -1.6875, "nll_loss": 0.494140625, "epoch": 3.6, "step": 180}, {"eval_loss": 0.44775390625, "eval_runtime": 3.7431, "eval_samples_per_second": 1.069, "eval_steps_per_second": 0.267, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -2.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.5, "eval_logits/rejected": -1.6015625, "eval_logits/chosen": -1.78125, "eval_nll_loss": 0.4140625, "epoch": 3.6, "step": 180}, {"loss": 0.3451987266540527, "grad_norm": 
0.25784447177312775, "learning_rate": 1.743948777242814e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.11379, "rewards/chosen": 17.5, "rewards/rejected": 0.62109375, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -197.0, "logps/chosen": -412.0, "logits/rejected": -1.9140625, "logits/chosen": -1.671875, "nll_loss": 0.46875, "epoch": 3.7, "step": 185}, {"loss": 0.3305183410644531, "grad_norm": 0.3269795021594465, "learning_rate": 1.4997896271528739e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.114169, "rewards/chosen": 16.5, "rewards/rejected": -0.5, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -560.0, "logps/chosen": -239.0, "logits/rejected": -1.859375, "logits/chosen": -1.984375, "nll_loss": 0.478515625, "epoch": 3.8, "step": 190}, {"loss": 0.28177928924560547, "grad_norm": 0.46923090007407836, "learning_rate": 1.2710006311864104e-05, "memory(GiB)": 61.51, "train_speed(iter/s)": 0.113877, "rewards/chosen": 18.75, "rewards/rejected": -0.9609375, "rewards/accuracies": 1.0, "rewards/margins": 19.625, "logps/rejected": -524.0, "logps/chosen": -346.0, "logits/rejected": -1.7421875, "logits/chosen": -1.5390625, "nll_loss": 0.3359375, "epoch": 3.9, "step": 195}, {"loss": 0.32286620140075684, "grad_norm": 0.5973510030321513, "learning_rate": 1.0585864495652897e-05, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113643, "rewards/chosen": 16.25, "rewards/rejected": -0.19921875, "rewards/accuracies": 1.0, "rewards/margins": 16.5, "logps/rejected": -524.0, "logps/chosen": -270.0, "logits/rejected": -1.5234375, "logits/chosen": -1.6015625, "nll_loss": 0.30859375, "epoch": 4.0, "step": 200}, {"eval_loss": 0.44970703125, "eval_runtime": 3.6961, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -9.75, "eval_logits/rejected": 
-1.6015625, "eval_logits/chosen": -1.78125, "eval_nll_loss": 0.423828125, "epoch": 4.0, "step": 200}, {"loss": 0.3552096366882324, "grad_norm": 0.3215452292580796, "learning_rate": 8.634798372847148e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113, "rewards/chosen": 16.25, "rewards/rejected": 0.890625, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -564.0, "logps/chosen": -318.0, "logits/rejected": -1.7734375, "logits/chosen": -1.7890625, "nll_loss": 0.328125, "epoch": 4.1, "step": 205}, {"loss": 0.2990260124206543, "grad_norm": 0.4025934432388528, "learning_rate": 6.865375481914016e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.11323, "rewards/chosen": 18.5, "rewards/rejected": 0.9375, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -215.0, "logps/chosen": -248.0, "logits/rejected": -2.0, "logits/chosen": -1.7890625, "nll_loss": 0.3046875, "epoch": 4.2, "step": 210}, {"loss": 0.31557292938232423, "grad_norm": 0.4259000697068638, "learning_rate": 5.285365727986707e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113298, "rewards/chosen": 13.625, "rewards/rejected": -1.359375, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -896.0, "logps/chosen": -143.0, "logits/rejected": -1.5078125, "logits/chosen": -1.8984375, "nll_loss": 0.1923828125, "epoch": 4.3, "step": 215}, {"loss": 0.2897603511810303, "grad_norm": 0.22959233205564938, "learning_rate": 3.901707263589671e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113169, "rewards/chosen": 11.9375, "rewards/rejected": -2.640625, "rewards/accuracies": 1.0, "rewards/margins": 14.5625, "logps/rejected": -800.0, "logps/chosen": -78.5, "logits/rejected": -1.5, "logits/chosen": -1.796875, "nll_loss": 0.125, "epoch": 4.4, "step": 220}, {"eval_loss": 0.452880859375, "eval_runtime": 3.7921, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.264, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, 
"eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -10.125, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.75, "eval_nll_loss": 0.439453125, "epoch": 4.4, "step": 220}, {"loss": 0.334822416305542, "grad_norm": 0.4254404063804997, "learning_rate": 2.7204760217631074e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112881, "rewards/chosen": 23.875, "rewards/rejected": 0.87890625, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -220.0, "logps/chosen": -524.0, "logits/rejected": -1.9296875, "logits/chosen": -1.75, "nll_loss": 0.5546875, "epoch": 4.5, "step": 225}, {"loss": 0.2966593265533447, "grad_norm": 0.5983738617042668, "learning_rate": 1.7468590353731495e-06, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112845, "rewards/chosen": 17.375, "rewards/rejected": -0.330078125, "rewards/accuracies": 1.0, "rewards/margins": 17.75, "logps/rejected": -684.0, "logps/chosen": -247.0, "logits/rejected": -1.8046875, "logits/chosen": -1.71875, "nll_loss": 0.314453125, "epoch": 4.6, "step": 230}, {"loss": 0.2641402244567871, "grad_norm": 0.4714305885244571, "learning_rate": 9.851316597681958e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.112889, "rewards/chosen": 12.375, "rewards/rejected": -3.078125, "rewards/accuracies": 1.0, "rewards/margins": 15.4375, "logps/rejected": -608.0, "logps/chosen": -184.0, "logits/rejected": -1.6796875, "logits/chosen": -1.7265625, "nll_loss": 0.2392578125, "epoch": 4.7, "step": 235}, {"loss": 0.31013832092285154, "grad_norm": 0.3902228714814237, "learning_rate": 4.386387988014273e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.11322, "rewards/chosen": 19.5, "rewards/rejected": 0.55078125, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -608.0, "logps/chosen": -374.0, "logits/rejected": -2.046875, "logits/chosen": -2.046875, "nll_loss": 0.55078125, "epoch": 4.8, "step": 240}, {"eval_loss": 0.45703125, 
"eval_runtime": 3.7647, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.875, "eval_logps/rejected": -233.0, "eval_logps/chosen": -10.5, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.7421875, "eval_nll_loss": 0.455078125, "epoch": 4.8, "step": 240}, {"loss": 0.3295879364013672, "grad_norm": 0.21408464592337356, "learning_rate": 1.0978021666005478e-07, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113152, "rewards/chosen": 18.0, "rewards/rejected": 0.66796875, "rewards/accuracies": 1.0, "rewards/margins": 17.375, "logps/rejected": -490.0, "logps/chosen": -450.0, "logits/rejected": -1.9921875, "logits/chosen": -1.6875, "nll_loss": 0.453125, "epoch": 4.9, "step": 245}, {"loss": 0.36420450210571287, "grad_norm": 0.1773336882227468, "learning_rate": 0.0, "memory(GiB)": 73.34, "train_speed(iter/s)": 0.113433, "rewards/chosen": 15.875, "rewards/rejected": -1.109375, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -408.0, "logps/chosen": -332.0, "logits/rejected": -1.90625, "logits/chosen": -1.796875, "nll_loss": 0.376953125, "epoch": 5.0, "step": 250}, {"eval_loss": 0.45654296875, "eval_runtime": 3.7077, "eval_samples_per_second": 1.079, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -2.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/rejected": -231.0, "eval_logps/chosen": -10.3125, "eval_logits/rejected": -1.609375, "eval_logits/chosen": -1.7421875, "eval_nll_loss": 0.44921875, "epoch": 5.0, "step": 250}, {"train_runtime": 2205.788, "train_samples_per_second": 0.895, "train_steps_per_second": 0.113, "total_flos": 725771410997248.0, "train_loss": 0.4901118354797363, "epoch": 5.0, "step": 250}], "memory": 73.337890625} diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs/events.out.tfevents.1737729092.kml-task-547024-record-9965643-prod-worker-0.38511.0 b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs/events.out.tfevents.1737729092.kml-task-547024-record-9965643-prod-worker-0.38511.0 new file mode 100644 index 0000000000000000000000000000000000000000..7210dc402d2f574d5e18891b8d7f5d0cf30c9b16 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-142955/runs/events.out.tfevents.1737729092.kml-task-547024-record-9965643-prod-worker-0.38511.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b401252c20bb30dc85254b7f9a47ccf2c24a8237b0140da1f99856d9e0468b7c +size 61676 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/args.json new file mode 100644 index 0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + 
"val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": 
"warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", 
+ "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, 
logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 
'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, 
force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made 
aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f10c3d39625c25a2d25d64e743b5a1481dd3574c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:848152afdca40c72603423fbdce55b2cbcc178e0a4ac2c3e0c52a0702f63c88c +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0289e38c16634afc67ddd258c15296011474b6a7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6055dcf32c8c3e6835c73dd862112b49d7af2bb29f4889141878cee9eccc591 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5788786f8746dbac3c064802f59a12010f0dae83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb3aa182a99b45bde76308385c650e18df7b5e83f672b5cf8ea98a4e808a770b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..704a9b170f7305cd4d8466b62a2e5e87b62ce0d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e98464c6b8000ff25455c95ffdef693ca488c9ead6ae264cb9f2d08f063baba +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b69337723e1347a0faff876ce68752a673a63349 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:224f55ae6e8d1af55d65094d8e04bae89bafcdf67bbb941d52287b7ec991e362 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d15827aec4d71bbbf5353467f5942a0385bc9aa4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b8ffb4b57e2acac6f3957e6bb8c198306c1f06531553a6b6420883c422886c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c47852fb6662e189450648e5cacd74997e4eeb85 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a5dd9add2cb25418319455e7d5f02063135bdf8963172df8b452d95d72f067 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cec76ddcd6fae4e970b0fef7bb9921918afedba1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f29c77562fc0ba3116dff7b5c377a02d828652607efc022f6ee82bb5434acd4 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f457ed2193c2ba4930324cb6122afe071df798ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2ddd9b0a38aab001fa75f2b124f5768ac39dff6a2024d2b2aa8f6c9bfca0a7e7 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..590f5f9a24593f3ff9d06d041aef221737dd0206 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4cc80537f02f37cf1f81e75069a7940716041598f661e1f5c38d639ab70d501 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93bcd6ebfe133201ff2d0edfd5bf5d9072e1a70c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189b2628efe9b8e8bcd258696b515ec56e79e4a091ce991941e4c2de3f2729ce +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..b8ad9eb370d777d1a427eb8178fded39c3a56db7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6434d462b37661cd4e32d2c0d4b7673ee44c13c7f5f6a59eba992c9c1c69e274 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a162a06b3fc99250f9d3d102d5af2aafc330c67b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38daf6c27b9716373ec351071e778fdb64eefd3d1e95bb6c6a8162d0f07484a7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a0c2ede703f28e7ac4ca711a4c20959f9c6cbc2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b557dff718a0bd5e198584ec037a4e4aa4cd1a729afc691165e5a820d81093 +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23eb9fa080e770a931bfe594532ad035995ebd67 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f57054d3c1a3ffb42857ac3baa055e084f48ecaeff4eac1c245b8c1967f4b302 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32c2064c2161c8ce8e4000304d05b0138ad329d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3679b565ace89aea01f4c60ec78b6ef80ce0f0aba4e046d989bada75931c26f3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..62635ed5827e1e6b1ece213e1a7f847343abb2ab --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef333792be0fe6aa800f9126d09f64e73aa2775dd74b41e7e9475c8b09178cdb +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e44eb3fa45a557460f92c098f2fd1eccee781e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e2e2737d63e3807ccb5977a1e9e37b3ec45ffad7bcc727e9e137aebc01368 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af4dfb2158185e5877a7417ea0089d5bf35a0db1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.42919922, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, 
+ "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 286911173754880.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob 
rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return 
zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..310bf1d0b9ba432daa6fca88236f11ad59e0afba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef3a796310d47e5e3b836778f1e81414306ae682891a77089671b7f7a6b3282f +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c373d06be00a6a1072fc40ba35e736ea8cc0892d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4020bd1fdee9ec8fb628c916818fd5626cc3828c94c3ed7aea1c9a02f0a02f58 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2953ba053ea84efe49c8d538bbcfd4cd5677c556 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56e56d65256302226a8ff1f51fda1941001b0046217daea4cea28af70d935ed4 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b27c2e9cb9a283d058dd73e96927e0f308c96ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db570dcff1a17270f9996beda2b567946291d6cc4b8fbe80634386c8d5959eb2 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae61e07a9ea42dcf9f93f4c385179f24e0aff9b4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5c6c85ace97b0bddb13bc5c05fff0d454e325de3aaf6e4f3651e816d59a1d2 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7bf90b9dba7bde931fae2ec5011547847cd4b79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:693b3df85128352e0222d160750e355e01b1cd7071508d4ae62a15e03c50515e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6e04cbdc4f7f164655b7c82f6d5d80b8d479f563 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bb50edb2dd880c9624f8e69781cae24b0f3fbe97757b68fdf9616376ce2eb1 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..637713622a914b1588b89b57a84fa5ca5692c4a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d16891b192911b18ebe78a852fc6df396a54df0fc9b5f50297aa8ae3e81ca6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de44f0969976c95cc1a4d5f44231d23d3156e1cb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c6ade62569ecc080d7baa9521fcaadb789962fcaa12a6f3ba5004aaf1703face +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e16cb0491da7505d2ba2332fecedff9a7579e984 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:100f0886b43b2152911a1ac3327c97b56722b06bc92ada601a8fe57d6cc40854 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..024d5c15c918e6eaa2d0b8af01b3761ea7c20435 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1391a4065acedb60cf55a6bd91044fc7f07cbdb641e2242c2f6d6fc20fdeace +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..400d1ce12574e9ada03b2833fe4174b63ed0b4da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b356be7ba26489c4adeda3fb9a910a6edebf269595e5ab46a6e69e0f71d8241 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1683f7d75254e9fc8704c0bb8761168ce5431fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3f029357e3f766991cc0ce4b12129cad0a8fb127da99d9a1332c3eddbbc36c +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc24d29149a8abe52631246348a03906dbadcafa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a8f78eca0f8d1c1252bed61fc196beedbba7250df4c24f81ee5a817556182ea +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b1e2fc30ee36f621407b54c3d8075f40ac298f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c286ca7c798e541441379635a95fd4a2aea4618eead75c5e34262a2f78c296f3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..300c87ddeb4265c795e925e5a888366dfa9e03e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace06d445c86c3aa8f283e8bb6a25163cba726f391e53a4739273d1d9c93ecf8 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..1c3ee0ef3dae8e8f83d0dc34dd365dea44f389ca --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217b4bbeb0f6c27c35be0a992bc5ec3da12663daacdb8393fcb8ae5d8fc63d9e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..0e13e0563ec45a863d519305a1251d3e72b9e3e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/latest @@ -0,0 +1 @@ +global_step120 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056328cf4dbfbdfaf5b7ffa668b29852f77a3798 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b76da7ccfd8d1a286433da6127628e0c6a1565950b2dea51fe5864ad3e6545 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b438a7be0d981cd38820f7a8c3a4dae1f48f6b40 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.42285156, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120", + "epoch": 2.4, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 343801459113984.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fde705c2d3e545134dc69ee188d81b87903ac480 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95903de809ac1c6088febec2ba142c73c3284f24c238f66e3d963e60e106cf0 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e47c7a8b3902de533f8666219aa318cbad6cb52b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0036a0e6606ed59d603641d1d6f1d13bcccc98d88bd28819d4d34ad5e3f17377 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..917f6ead691d9339cddcf2442f369878d76477ee --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aacdd7cfe8b4c0a210d9d0c5a64e6507eba0903f9cc1189bcdea2bb86246686 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09b785f15cdd2b493afbbc7d46ab0a09a26c8eff --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ec80bf74c61aa334fb359917eab44d980076d2d0ede53b6b798bd948d5cc23b +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d56e049dc737d9ae5d836c4e4228d25afa6f9d3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d20eb8f789bbe44505cf90dec3d4941bea880cd61ae61f915eb4f513b8749d +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20ffbfe2fecb36de10a98ac53f35ff5552a34c94 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d954abe72be0429e0c501e4081372d6562ff2e41789acd0910d43220d16de21f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2e91e1011c1bd90bb737a5e52b8f2b6594ebc749 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d995c7ae310b1954593b7d709aac3d2bfe9fef4a555eaff4923155343bc5007 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a4bcc2ef7a457c66fc1a8610d1b6a620222a7f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e89d54a80b1980317f70532f4114dc5b5aba67062db7bdfd69a4217eb5b4bf9c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c47121b0d251bb8aa39fa856debe930a3383ee5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:13b909e6d648a7a18764a8e38dbe22063ba8873b9a6b22431eafab6a6a800f94 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50d669838e05c4a85ccb73e854384bdcc8f62f2a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba85c8e6fa53109c2a5c97fa2e1b9d8c7c3d84b77567f9500675a855668e185e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dd579d0732140cbcb929f7f0a7cfe4c10c2818f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56b37a0d8d9494259013fa81f51b508e86b9dc84196d71df3738b96f08222d31 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..bcf5dd610a401a3bc379d0662134b0b8f931916a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b284432aa1d88f0eaf60862a87281947df4723e200485f2dfa01836a7d8d55a3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f441a8dc7614691a07f046bd540fb7c3b412b864 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8be766ca056ab279d159dffe15fc4f3131211e59140623ac3a1abb4dbfba5e8 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..434165eb7eda7e71604d098da406bb2041accba8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dd08ddf0c5df9303b4274c8c2adbfe4c86fcaafeffb16f1af176c7a060d28a +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d6503afb248a341bc30bd2f63309e586b959731 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3684664a3e78e1537481f1067496eac1fba2e89b1ce73bad08e243622de889d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f515c6f0917b5aaed9b22759f281b1deb1d73d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d47f8fb62bb31fab4c711bad01567ee412ceea40b6a38f34bf9729d92ce2cc2 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b06c5bb5db0506886581ceee08b5fb7d3c81c180 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a541685035a050fef28ae37acdf050af58cc745e3ba83f2b74af9c33a60b9f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd2b9aef86529798137c2868d556e873a23c785c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/latest @@ -0,0 +1 @@ +global_step140 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6e613ad02e1482b1eef52ff51329fe67d4fceb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9c57c64e42f5d7ec5b6fd8bf14122cd4f49a4ae907dcde9c057b79cc82e639 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba076a37c81737dfef79cec75e8d624e9dd321f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/trainer_state.json @@ -0,0 +1,674 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 2.8, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 398909924442112.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f61aa4b631f838f1d44163136915544afae0c00 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08b757e7c754073d265e63e2d9332eb50c5067acbcdede645d19b51d171142c7 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f1eec1b4c42e6311b885728a417fa8be2b12609 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f333a447dc9461a1c49fb334586c08c64bb0d8ff8b6abedb115008ca4050c0 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..308dbbf97f08157ec0fd4185e25afa1bde0cd583 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5703ec094ddea75fdd0d66e58ec9730e2981f448c4c97b13e510c8d095fc1337 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2a7e752f9136bc59f2047b7a503f591e2281a69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987586791f8ced6fbcc4a5b6f80fa29eb5224c06dc5a432ce6237c1af890a3d6 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..230d3faf22f70265456b6c5fb2c5ce62adccddc0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb658181243c5f7cc85f6d4142562ff0a426a22adf40c906612835220ed2227 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75ea3db4327bd7559c28ba95f3a317e6e11526dd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfad3cffcc7d2a60cc4586e411ab1ee0261d68a37debde0c525def4d09c5c69c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f8b05c831d4b718581f9050c257fd5c6130b6e59 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8fb3a413d6bc9f3a863754219fb765d0bb132516eead8817b9cec84d8caf21 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..035c71cada8d32f204b77a6fb63c4b01ffe8d5ee --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4fc556c6188e6ca5b00f8301120acba85a69bc4d1ec0f93b4fa9fc746a29b49 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..889763eb03ad4e2746d23f643e22243a870ea71f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8d2e42cb14304564db7a6ee30e3f6b68d76dba822b211664dbda1ba87ffd769f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92ef6f5d456213e36f8711ede0200ab8000b65ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ece968334e9a4e84cf1c26054607a358898bb7a9aa1e4987673b4c8eadf4fa +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0e105fd2b1b5ac4df22bab0a461339ef1dc0fb8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b736a1290f6de23470d030b1a237677ff66adcd19955adb340161fdeb71f3f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..6fb22f6a40099c6ad1d123a968c4c0032455f8c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e4e43fefe6ae28ed460816172fa6e2ee6dd3413fb4cab71be3adcb9c3b6658 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06bbeb209cc670e60aba299a61c5ea75582b6f92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891c7c230f608025b4f6fa942f999287ceb2ef9ea064d66ef00ecb6c11b45138 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b406230a10be851a1eeb3ca7218084b04cf821c0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ed7fbb184af80211122be1643561d40a79ae278157fd8aeeb206018df778bf +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6285c2e170f5b0a5f88175afc8992ea9b056765 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895450fde1fe02b278d751361d1cdc00905179c2734094d931d6fc6be91bdd64 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e994069118d0fefaef50121e1d186aea70688f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476165855ad1567b52b353abe67105987165f50259bd9930d73d6a105b48a783 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f3b21d106c1efcafb3c88f3d50510d9ce615d09c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88f628060589f0ab7cd5f96e0aeca6bef8f7d2a7616134e9d990d552dccf763 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/latest new file mode 100644 index 0000000000000000000000000000000000000000..3df30ded267d950ff3ca04cffb9660be12079ca6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/latest @@ -0,0 +1 @@ +global_step160 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e31a2394e12bf431ae13288c3d90fe4727f07fa7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6462d333dbc5bb5e497ea9b0adb960f7616f79e6eea63222de6d5bd559516 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1db0a0f44aa3ac1d82c3bf8dc2d8968eeba4ce7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b045e1bfa728f51c8b51ab0faa20b128a4fbd350da006b9b39a19e24abdf5a74 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..75de18f57a056bd6a5f89df1abd045678f3f919e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a3d058d2628a61848c2441d313f251278bd8f74ce43dc44d8cd8ad3e619a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fd100693bc9f3267d044ce4a16e702502dc03ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f72fc498e6eaa671cdc0e8a627a668b8ef607063a22ddb4edbc05e791be830 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aeeabfe119f1cb0c8c804f1b9a4d3049f478d69 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12889af98e175b734a788f4c5b8c4da91dd61ff3a05aaf61b9d4c66aa3dd8ad6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fe0f42382ab06f4d26d753745a914c9e46100e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe21a86abfceeac2cf2f48afd61a9a506cf61a287f3403f1adf391bb2ffa5a83 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5830ca6bd04645962b6e56a00a91cd8349ca449c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73488bec91f9dee6d8105d06f99edaf4d27b6b064250d4c7023f33285b2f3132 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..343d1c0475f0dc64100dc67b09195e047f1a7bcf --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf6ee1cc2e1325b428a21172ec4e61b7220c5489751ea11c06bb66c77a0cd08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..229789af83e72e748f236450e9d2df977318d98a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b659f5e1f39ab526587d47a9d305eeca96cdb1335d25ff0a7b9958f685604b4 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5cf17e41be787c49653c5ea3910010050e5b12c0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/trainer_state.json @@ -0,0 +1,763 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 3.2, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 461344443203584.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-160/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0d2a5a7eba98f38116f31c35dfa99ff098dd717 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d50b0f96bc5e733b0e040078a1d57f4c8c63cc77f068b42f91465fa4d65cabc +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b2b046c441e6afece9cf9d28c83f6a300fe085 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc65c8dde3f5b58a6155c43f7d5a6c110ad9148dfe50c3c597aa5eb22aacf7f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5448298b5e6115032a441373c0d8fbdaf53cd6b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec9ffb5d5d4c606d2374f843b26120d908f2502587ed9cc39f3531ed6590bb5 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a43fc7f0410c84bc891724e2f79ed2520d2ce25f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2406cf7e95d35ce16be5c7653219ef7545c9064d4faed530a88e11e90c61499f +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2c8d5425e32d66272ddb2c602fe0050285d9b76 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5bf48b4f7c638bfa010106b24e7ad29b12aded8a9297c3d8b9c6867fac647a5 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f20698e3e23199457131cdde595112d0836fa312 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb65797b0df5b0746d06383e9feb5a8d6ff35e1aeeb8319a6628cab776834cdf +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2eed2dcecf2021cdc0ac69d622c021934dda571b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:377df54c1bec6740ece85af2cbd43eacab0edd0e6f8530c49949e2b3d7bc2e78 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18e02f70a6a4eeec4a362f7931cd4a76b787994e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5b0e77aed422d273efdd5b37bd4bb6ae76773c4a5bfb2721e8378ba24dd69e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e80a040ccb0b66384c9b7611615abe0752777bce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3f405d9453881adc10fe7ce566ebcb3ddaa4b618575359ea81f3ee061fb57c07 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce7402793537355fc2871447bc69ed9d94874de1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88da61dd1d2eadde0def04e82fb9c563190b82b4fada435767b043e9cb99e9c2 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6861c5213a77f98d7ae04bf7fdf5544f18e38ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d896cf44f5ae5cfb0b0672aaa5d2f2bc7dc398e600579d7a563c1e12e5e8f868 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..3dbf57ca5528fae0d7dd4f887a42386490c74d3a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99a62d11ca772c7128224795581b8e3dede5719839171afa4be80583c1b9963 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d381b635aadfa9d274f70f301ab5999d14d7361d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7805212658b287efc8164e7a8ff9ca1f75379dae7f732429753e67bdc995ba +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd0250e69ecb6ad2b478b783a98ae56a51eefb31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58f8c8180c503b0557b6daa834a7c7acb3fb718c2e9073e433664bf7d023d50f +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61c3eb94cb68cdb8e5713e23b28481bac46e91ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6c23ca0fb63e716d16fbe0b04909afe92b09d6293237ee53a7d4973a388fd20 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7730a09d64d14b9556a34e9555e1a2eb3445d9d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f56560f4e984835705279e3f77f7ff5b3859b4dc81de22eb106b902641eef4f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..eececf14bb3a4e77c4c7cdcd716ea2c68a56c1e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b1e2b044fbcaa3c03b8be6d0c38ea38881719091425c350d96e9db2b9e7a5d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/latest new file mode 100644 index 0000000000000000000000000000000000000000..eac7d625396c2750025575c77b8da5d622b0c7dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/latest @@ -0,0 +1 @@ +global_step180 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f51b498d48145bd9cc14b35f8236b9ec95a4f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08e59ac81067b262a084604cd3392250166c2841 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20a24c17b4be2ee59cd5e6682010519318a91e58 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..54050f6cf8fb847e2a926e14a7aad2647761521a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..263aae475c49b090bce43f143308192c5bf9a95b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..942ed5d60ae87dce686b33da76a34db404036dc6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..57789be3df3983cb8acc1500bf6470ffadb1c578 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32d6e2e7eb7148713b473b0c821a98e616ab6e6 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18942cfbbbc36710e196a20b862a745c9dcc2468 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa6cf7ac608af8ab72180ce60dcfa61b0bf4eeab8e185f70f65a95b45e6b7a +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8fc658927761bd1b8da4c86ac5b46edfd4754c5c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/trainer_state.json @@ -0,0 +1,852 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 3.6, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.014697201946726, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.6953125, + "logps/chosen": -133.0, + "logps/rejected": -412.0, + "loss": 0.35755233764648436, + "memory(GiB)": 57.7, + "nll_loss": 0.150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 13.5, + "rewards/rejected": -1.1640625, + "step": 165, + "train_speed(iter/s)": 0.114154 + }, + { + "epoch": 3.4, + "grad_norm": 0.35082819529806103, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.53125, + "logits/rejected": -1.8671875, + "logps/chosen": -498.0, + "logps/rejected": -141.0, + "loss": 0.3598182678222656, + "memory(GiB)": 57.7, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 18.875, + "rewards/rejected": 3.0625, + "step": 170, + "train_speed(iter/s)": 0.113875 + }, + { + "epoch": 3.5, + "grad_norm": 0.4565042851234479, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.546875, + "logps/chosen": -306.0, + 
"logps/rejected": -430.0, + "loss": 0.3215118408203125, + "memory(GiB)": 57.7, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 13.625, + "rewards/rejected": 2.046875, + "step": 175, + "train_speed(iter/s)": 0.114047 + }, + { + "epoch": 3.6, + "grad_norm": 0.2804732100108321, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9765625, + "logps/chosen": -426.0, + "logps/rejected": -232.0, + "loss": 0.40310821533203123, + "memory(GiB)": 57.7, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 15.9375, + "rewards/rejected": 1.734375, + "step": 180, + "train_speed(iter/s)": 0.114563 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4326171875, + "eval_nll_loss": 0.34765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 9.9375, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7191, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 519058559074304.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-180/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..277ec26bcd857b897d5ba4101837fed9aae44c36 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b618741f409bf2811d7d65d16806a95d67fce68193709e81d112954b381a146 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c27a8d34ef1995c5d4d89263017ea4cd5ad60a3 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4899b645c7804a2388df48d20efe96a4b59218172a277bfef68f41941747da97 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b7be61c2f0541959b25d6d8b18c0443e659d915 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7336fda381e2a33781ccf084985cf8ca7195bace83f8269c4615bd7080113dde +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fa856ae33a065c688a3d2dd4bc626abe3787bc5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8615cfc0771ac39b876437ac167db41cbdbab6e3c0692e84e3e1c46da0170dfd +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4992ef0cfe05fa68eead2ec07465e26584ba67b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de08b96e9d2c1f890823c503c466c0ed5d2367452ec71253cfa313f3a8ce06b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c769742e39b6b13119c724ff99633c3d5d9c2884 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f6b5cbdf33826a2b17ec9ca42c86c0cd8d4301932d7c2190e7b814663cc6c7 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ba39a37fe7eb1d8673bbd2f65864c14b727bb258 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c2167aa0fe39a076b5ad79be201ee38b6526eccd767fbb110c37d88e632ecd +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfbb6d92c490e447f3e814521f65d2fb70ea6566 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71939bea64358adf4a618dbbaf2718f4bda4005c08675dbf64178e6859f5543 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61baadbcf64a7fc86c1a40105fb88d851a7b1e5b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92ff9a19d322c72dfbe3c54e9c7d38330a8b9bff347fa994efef75dea0689e7 +size 
100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71e6809710083fe9e4f43f4f24320ab837b4b2d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc32c30258d2e509d189a7fb90875c5854417eae2fc2c69faf195175c37bb12 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df4cf8107b9a471b5634be66dbaf87b0ddccc0fb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52500ee3ca3c11c5b445ab093cf8153c3fff876beee85b0abd10261d911602c7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..7c74002e4e7e77b8d2379cd0fc0051218a501eef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8782f491d6fb02bb2e55278718c1f43545f2b0854ea7d67dbdf8b2e329462c14 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..362fa577140554954bdf284c91ce9280351ab4b7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae71a5ab33f00bdcd484910f31edda017814c9cafade2221d327f59b4a096e70 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e48629d032c86a46eff6dc97fa0d157e6ea400c8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d3824c22ef06bba7b85ad2be38b5269f76c72eee929e9ffca7d0475f0bdd3f +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5914584e64f6b0b05de05207b1abfb5ae081c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c75fa988ef5168c28fc1e2d5f6aa773f62a6d4ee47cb09d5b6c0edb3e009325 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9e00fcbe7c3e8396f4b9c17f08b813c8edb64ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb900f2674600b021e706038cca71fb2f23fa4ef4bdcea1e79d46e83150d85be +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21854f7478406ebcd42308f8d76481466e204436 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46405c8ff8635574e3d845efc734f33cf5a7ac4e72e9ec8de87bb6995bf2970 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2729ff9a97436d6c8ad743637f529065140ad3f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e1cf73eea4791075e839e628da180bf39e1e01fcc1630f4ac9c723d8793968 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c8cd1a0a7e196d9ba62f0f302512e2549e51e95b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.76074219, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20", + "epoch": 0.4, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": 
-482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + "rewards/accuracies": 
0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 58463755960320.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/zero_to_fp32.py 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d030437f38a1de5eb26d6b5ef4c71985daca25f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537e2b2d0429504a120485480f145e07e16833b19a4d8fc05b0dba311a3132bd +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..221bcc722f4ac08ea309334abbd41a157574b4c2 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c6efe81bfcf3ea9aa7962bddfde5727346040ad6b4c44b545083ecac8a1c34 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..866083d9ef17a7a042332430dbcad8e813c79df1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17ce1a480f6b1552b4e0b96fd00a691a9b6f18c5c7e559d54950c184040fbb7 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4428e3ca5bcbbe964f3a03afc8b096013d159531 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ad6f4fd37adeb5b5df7a0afeadef899bff02471fb46ad76bb3cd04e62b0b6c +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02ea14106b83374d41cf1f00e36f0cded5376c71 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e3a723f7c4fff67538460e38af5b18b636fc462186a8585ac00f4a9869f0cb +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56602f251f788b083eeed4d28b01fe071251021d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f6d3aad106348a050c7e89bd096e10707b1313ff20ab0854280bc987673fcb +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e14959f3539460b28848f7a284ca360442ee1986 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131e44da6927e764acfe1128d2729e33dbc1476d3874c076ca4b66c5efb705c9 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff46a8d5553de41ec9445ecd4243f48a81420f75 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f993d13a222bbf4bbd5ee5d9462e159a8d98ef5c77b4e68bff16857cbe6d80 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93d1369bea5da65fd9c272fbd4aefbfe1c93966b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ba8a5bde47f1e74a0d3670f9eec2d6e995d8ffcf298f10d39def91c29823ccd8 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bde23f44b716ef5fcc6d6390ebdf0edc5b15b92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e33cde7b16859b5088f4ffd3845d8b48453bc84e09f7506ebffb9234610fe5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6bf47b04a0b89fed5ad1c50e85a4ff37b487d8f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd5cc584e21418e22cbbbd2d7ec48f6ec048c64002e0ee900b9af47af8dbe84 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..93dbf664b416a8bd76825356c697a24be80adb0c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cb0b97c861f9c1b765a07fc612a643ae9b975529c3e6a58fbbff5e56c8029d5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0271d603f4f65ede5f2293541eb3ec66210a36c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b064ada0f9c945ba8307572a411ca15e51a66bd356854dc8d164870a90556196 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..349488e8fb77bbbff083f02acfb8f80e05cc3b1b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7237dee3b29ceb881f3bea76e22e054294c64423b2ce152496587f546b4e3642 +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59ad21e07ae6b3d93dbade2be79e362268e455d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dafaf936472fd312519629ae86377e05b4e8366a3fc1ae46ab7357f088e5b8d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d1a3db634dbcb4cce6a7bff3aa030847e0d39f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b62b423d79abaec10d71574dd8918809f6362cff22fc46e6cccda16ddf4a4ea +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..5acb5f25acad0b7402adf58b560cf3083e3fc002 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f88406a16b5da116748d1b2e4b370def2120e4e9c689a12a815d2f60c4eb6d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/latest new file mode 100644 index 0000000000000000000000000000000000000000..753e24e10f3a2489150f458205cf759fd8b6081f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..584f4a4a43f100f35696d7314a633631af587f25 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891ffa7c7dae99113aa986d67278b52b8c57db55001dc3547a61f24569a34ee +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..05b027a867e5e9cebd446293ecff82cfb240cc76 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b92875cb04deec367605433847d1bda444b178b643d2da7ed9aaf738d232b4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..af98f0dfe2a5d89fbccf90df58246a0b078c7016 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f5f3338a05e325b5408a1cd0b6f5e5b10fad05fe479d63f44bec4cf18107d6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..715aa4a4ee3915f810fc2bacb2153eb8a0913781 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be749fea477a3867d44010631937e0d8f071ca5f9614f9795c92c7fa68833a6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bde70899833455b6ee4a99aff9388abc5ffe92 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc4a5ea4532c621f4c8e9891117b2e597a7f005001e8b4f2a1b4da8c82bf964 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..90cdeaa2fe438098e9d95ddbc06c765e51af1e78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480f9fe7dd71b54d915b46162e34b780ba2467d5542115cc809dbca60b394c0e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd30529614c5be239cd9477af6bef0e313740b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11d982dcd813e82c2d97a5491ce9624cff2dd22e8655ea617ccef1fc1474470 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bed311094effd49cc2c89237c675f56eade157d1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73494fac3a001cba7cedd097b97f028d4c1d136ee6709214b0a7fe305e5b9089 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b08896e3e64039017a0606b43a6327f1f78848dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826281cb7f404c3805b9798147d05074dd208eac748e2052087055a015aaeaed +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..865786c61d5a5e3e73abdf832f8a48fbc84053b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/trainer_state.json @@ -0,0 +1,941 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 4.0, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.014697201946726, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.6953125, + "logps/chosen": -133.0, + "logps/rejected": -412.0, + "loss": 0.35755233764648436, + "memory(GiB)": 57.7, + "nll_loss": 0.150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 13.5, + "rewards/rejected": -1.1640625, + "step": 165, + "train_speed(iter/s)": 0.114154 + }, + { + "epoch": 3.4, + "grad_norm": 0.35082819529806103, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.53125, + "logits/rejected": -1.8671875, + "logps/chosen": -498.0, + "logps/rejected": -141.0, + "loss": 0.3598182678222656, + "memory(GiB)": 57.7, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 18.875, + "rewards/rejected": 3.0625, + "step": 170, + "train_speed(iter/s)": 0.113875 + }, + { + "epoch": 3.5, + "grad_norm": 0.4565042851234479, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.546875, + "logps/chosen": -306.0, + 
"logps/rejected": -430.0, + "loss": 0.3215118408203125, + "memory(GiB)": 57.7, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 13.625, + "rewards/rejected": 2.046875, + "step": 175, + "train_speed(iter/s)": 0.114047 + }, + { + "epoch": 3.6, + "grad_norm": 0.2804732100108321, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9765625, + "logps/chosen": -426.0, + "logps/rejected": -232.0, + "loss": 0.40310821533203123, + "memory(GiB)": 57.7, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 15.9375, + "rewards/rejected": 1.734375, + "step": 180, + "train_speed(iter/s)": 0.114563 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4326171875, + "eval_nll_loss": 0.34765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 9.9375, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7191, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.2935390673249648, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.8984375, + "logps/chosen": -412.0, + "logps/rejected": -183.0, + "loss": 0.35445404052734375, + "memory(GiB)": 57.7, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 15.1875, + "rewards/rejected": 2.296875, + "step": 185, + "train_speed(iter/s)": 0.114484 + }, + { + "epoch": 3.8, + "grad_norm": 0.3456175313601738, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -2.0625, + "logits/rejected": -1.828125, + "logps/chosen": -246.0, + "logps/rejected": -552.0, + "loss": 0.35137252807617186, + "memory(GiB)": 57.7, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 15.875, + "rewards/margins": 15.1875, + "rewards/rejected": 0.67578125, + "step": 190, + "train_speed(iter/s)": 0.11489 + }, + { + "epoch": 3.9, + "grad_norm": 0.35528088328631074, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.625, + "logps/chosen": -348.0, + "logps/rejected": -496.0, + "loss": 0.29118738174438474, + "memory(GiB)": 57.7, + "nll_loss": 0.33984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 16.625, + "rewards/rejected": 1.609375, + "step": 195, + "train_speed(iter/s)": 0.114611 + }, + { + "epoch": 4.0, + "grad_norm": 0.5975495665409197, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.59375, + "logits/rejected": -1.5078125, + "logps/chosen": -270.0, + "logps/rejected": -508.0, + "loss": 0.3332973480224609, + "memory(GiB)": 69.46, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.25, + "rewards/rejected": 1.859375, + "step": 200, + "train_speed(iter/s)": 0.114365 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.9296875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -8.5, + "eval_logps/rejected": -229.0, + "eval_loss": 0.43798828125, + "eval_nll_loss": 0.369140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.6821, + "eval_samples_per_second": 1.086, + "eval_steps_per_second": 0.272, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 575884546801664.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": 
null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5dac54b9b19b9b815514ad3cc3c09821fedd8991 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109d218449fa2bb246c8d24cca08c7e4e3cc043b9fe32f33c1edb3dfd3a89d18 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08a96d22c1ab5aa6084071e8ec70a8f76f8a6187 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65bb9498bddd3ca7a0f4f6c0fecf01c233572bcd0b77cb9768297336f0229c7f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c45ad62b4f24d5d0d5ea726f4a3a648241f05c6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d21270dd1fbc149df078365f2e2badb787c941113445af8acd55ebc240d3c2 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..889759b140fc21302601c74fb66629baa38373ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1d142842a055b9cf7d30585c4f6526c6941be11ef70e8e2333df226c08bd6e +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c06b65ac3cf1b816077bbeb5ebf4d8a4af119c07 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:733d87d090ff9e0b67a39497950e06a6533a3cb6b56c55f6ab6068eaf25682b4 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c1a00a73c999060bcff343f9934a216f45ab9f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b34a186649ac138690a634e07a17c624691c88365ec1d3ede82b658fc7122a +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..97c65ef8189fb5f1a14964019d169f08d9ae2400 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3933d3e5326d1374977324b138267152f2b54dfbb8062e4e02447766684a079 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34be7c0613e7022480d7e33031ef5f277172b0a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9d65c3578ae71f36b71f89e8e66f4c02dc4d8eaea26096b3a17cfd3c6fcb2ac +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85c1ae9d0047fd449d22fa62b82f4aca429fff60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:808b4e02f4ba44a773f9b0e0f081024493cb379a8a898a3b65de6a1fd322ac2c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5008545763215fdc50fc0d1d379597b5f24a8a2b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ad77c70c15202358a7ff7d6b1c13984b94df7e3ec953a112ddafbccd9bfd379 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..282e3dbb0373140b1da8c69eb2924ea0c53d2a43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f6f57fe76c52e8a35f234dca157c04b0e6d8729e2e355fc4df4e99ead34219 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..92b737c9ba1961d1ac3fc17aadde348b5c8cc150 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59f16d65205d88957fca642dab82620d0cd32d7dfe4f8a2fc492cd387b27ce5e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c846ee202b383799ddb094c8701176c7a7dfc1e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe544dcab6c6a2cd20ff66c6d3181b5cbc88c105912866caedf7ae63ceb25729 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f197adb10a942a6c4de27fd6e8179d90d72820c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989016ff43e8eacc0926929f1690a0e292594a415551b8e6dfb4411a29d58ea6 +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcb7eb0c9cea38504f9b7e203762cf56be3fac33 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43b75f19ce28b0f91a783892b7cdb25d45491bbde542f96c472bb9d87d0e958 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43230214ac689a3045c81514337bb9de9af9561b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd96cd520bb0f23cca4152fa516968cf862567839f0059c1de73e4645f4f987 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..71920eeac9666981c979f5d10ff4177cbb1bda5c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:866d74d3d2053941d63405d021855a3f8faf84868e471f06c8f0b3363b88374e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9ebe2709e7f014a6431e10a08b9ee83756b9b83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/latest @@ -0,0 +1 @@ +global_step220 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc26f1e85f4e8e85881b70bb37705b907a71e2da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192b6eaac6b92a2de7d039b2fc8b1f373bff6953e1e6a952189b56167078edd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43ba694169a5ef3484b4eba9acbec135f3b402b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/trainer_state.json @@ -0,0 +1,1030 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 4.4, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.014697201946726, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.6953125, + "logps/chosen": -133.0, + "logps/rejected": -412.0, + "loss": 0.35755233764648436, + "memory(GiB)": 57.7, + "nll_loss": 0.150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 13.5, + "rewards/rejected": -1.1640625, + "step": 165, + "train_speed(iter/s)": 0.114154 + }, + { + "epoch": 3.4, + "grad_norm": 0.35082819529806103, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.53125, + "logits/rejected": -1.8671875, + "logps/chosen": -498.0, + "logps/rejected": -141.0, + "loss": 0.3598182678222656, + "memory(GiB)": 57.7, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 18.875, + "rewards/rejected": 3.0625, + "step": 170, + "train_speed(iter/s)": 0.113875 + }, + { + "epoch": 3.5, + "grad_norm": 0.4565042851234479, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.546875, + "logps/chosen": -306.0, + 
"logps/rejected": -430.0, + "loss": 0.3215118408203125, + "memory(GiB)": 57.7, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 13.625, + "rewards/rejected": 2.046875, + "step": 175, + "train_speed(iter/s)": 0.114047 + }, + { + "epoch": 3.6, + "grad_norm": 0.2804732100108321, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9765625, + "logps/chosen": -426.0, + "logps/rejected": -232.0, + "loss": 0.40310821533203123, + "memory(GiB)": 57.7, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 15.9375, + "rewards/rejected": 1.734375, + "step": 180, + "train_speed(iter/s)": 0.114563 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4326171875, + "eval_nll_loss": 0.34765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 9.9375, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7191, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.2935390673249648, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.8984375, + "logps/chosen": -412.0, + "logps/rejected": -183.0, + "loss": 0.35445404052734375, + "memory(GiB)": 57.7, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 15.1875, + "rewards/rejected": 2.296875, + "step": 185, + "train_speed(iter/s)": 0.114484 + }, + { + "epoch": 3.8, + "grad_norm": 0.3456175313601738, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -2.0625, + "logits/rejected": -1.828125, + "logps/chosen": -246.0, + "logps/rejected": -552.0, + "loss": 0.35137252807617186, + "memory(GiB)": 57.7, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 15.875, + "rewards/margins": 15.1875, + "rewards/rejected": 0.67578125, + "step": 190, + "train_speed(iter/s)": 0.11489 + }, + { + "epoch": 3.9, + "grad_norm": 0.35528088328631074, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.625, + "logps/chosen": -348.0, + "logps/rejected": -496.0, + "loss": 0.29118738174438474, + "memory(GiB)": 57.7, + "nll_loss": 0.33984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 16.625, + "rewards/rejected": 1.609375, + "step": 195, + "train_speed(iter/s)": 0.114611 + }, + { + "epoch": 4.0, + "grad_norm": 0.5975495665409197, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.59375, + "logits/rejected": -1.5078125, + "logps/chosen": -270.0, + "logps/rejected": -508.0, + "loss": 0.3332973480224609, + "memory(GiB)": 69.46, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.25, + "rewards/rejected": 1.859375, + "step": 200, + "train_speed(iter/s)": 0.114365 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.9296875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -8.5, + "eval_logps/rejected": -229.0, + "eval_loss": 0.43798828125, + "eval_nll_loss": 0.369140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.6821, + "eval_samples_per_second": 1.086, + "eval_steps_per_second": 0.272, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3331703215396951, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.75, + "logits/rejected": -1.6875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.3664663314819336, + "memory(GiB)": 69.46, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 13.5, + "rewards/rejected": 2.046875, + "step": 205, + "train_speed(iter/s)": 0.113733 + 
}, + { + "epoch": 4.2, + "grad_norm": 0.3665381220661868, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -255.0, + "logps/rejected": -192.0, + "loss": 0.3193946838378906, + "memory(GiB)": 69.46, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.875, + "rewards/margins": 14.8125, + "rewards/rejected": 3.109375, + "step": 210, + "train_speed(iter/s)": 0.113959 + }, + { + "epoch": 4.3, + "grad_norm": 0.424301607874042, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5234375, + "logps/chosen": -149.0, + "logps/rejected": -892.0, + "loss": 0.3307832717895508, + "memory(GiB)": 69.46, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 13.0, + "rewards/rejected": 0.002349853515625, + "step": 215, + "train_speed(iter/s)": 0.114016 + }, + { + "epoch": 4.4, + "grad_norm": 0.22292040670451319, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.53125, + "logps/chosen": -81.5, + "logps/rejected": -780.0, + "loss": 0.30391464233398435, + "memory(GiB)": 69.46, + "nll_loss": 0.1484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 11.6875, + "rewards/rejected": 0.1201171875, + "step": 220, + "train_speed(iter/s)": 0.113876 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.0625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4443359375, + "eval_nll_loss": 0.39453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7528, + "eval_samples_per_second": 1.066, + "eval_steps_per_second": 0.266, + "step": 220 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 
20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 636032434831360.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-220/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob 
rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return 
zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb79ce2ee5e337d4986ca06dc36b8f91aafd00f8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c98dca4c0611c3773be43adbc07a01ef20530ceaa999b5d8f834d3cfaf3fb187 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c310fa5e1135a512cbb76e39ff09fe5a1f4ffc8 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349087dd7de215b1c2f32dff1019e68d9211c5f61a5ad1e54aea0da97f19fd7e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae7007186ef32a4ed1b4e4a41f892106d86107d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6232370f7e9491dcf3ace3043fe2f581a295326dd0f8bf69bcc27a926d45900 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..015caf3da26942e94c7051efb7ec0dedc75debbf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab59bf6c34e74f4d2c7a7f1291fc6e54e3500680368e399e7ca57257fd734d44 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34a9d3c15710aa4098b8a5eff97b8dab6067fdbd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d17c145f40bb60ecce1671f9605b9135d2c8919a810487bc9a10b7930ba1bdf2 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0ed71606a78b259cb044720dd57c0e38a4cf595 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91bb3ad418d16a0f61f1ae4c62f57c98c1aceaaa374a8f229e3670a08c10c7eb +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..00d0da6a360e983dafefef426681e04c8aca480c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31430fd23d724b1e2bcd742baf86a22636a3bc913f54853525c0ea2bfa105486 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dcbf97e1b53a8021fc29de75001a447e0da0945 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb6c35c736b30252ee018fdfbaedd30f783a161bb5f2b052211ca71f57cb32f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a317b8e0372fa4d60681f65c348857abe15b0893 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:22f2bf706c87dc70de393d8ad6cd6d975cf197ae59cf1f8c2f5612653f9801f3 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7206233e292811c1fb8903f3a2d8770a9b770e79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02604cded14cd2d79957c721d05827970ba405895a73fe52e20900e6ed29ef33 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..905a4fca3ed9c3b213a7572dda6b448b2d5fa7cf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e535dab82d1041cedb024955c508d5ae4309dea10118d364308e96b4b3be15 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..e3e2773c79471caacdd8feb060b17dc30e450c1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db22f5469e1617c402491fd8895eb803b15fb5f1c081edebb47b1e3e36280c95 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9e7b45781caf41ba4ce0245bc2df6efdf30cfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0866ffb98c3cf04d78b0b6810beb088a8bbdf6fb5f9fd749ba3f6eebd7e25a09 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4706b5e75c91f1487b9a1cd764b35797ffa72f08 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4573283ab9c550879810ceaa24b2af427002a5344fafffad523593738c6f3699 +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37b0caddbf1da86d62ff0150ff7df116cb3fb272 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6577905b359d657064d8502259bb64d9c14c8c36fdb2462a8e23cd491e5079bc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9038f23f62ee384c72ca4adcd28f87704ddd8a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf3531a040f9a558f8ed1527d3d9f6a0aa255cac84a4cf3bbfe0d77a7bf6564 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..7dac544c45d5dd20ea5ad34f8e0c30dfc7531450 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b5a6aefd5b4b41357dbc7622b7284276bdd6dd5b67712ea49602271d8fe9c9 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/latest new file mode 100644 index 0000000000000000000000000000000000000000..161e63cf7292b2184098d115f0621d2ed09e86c5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/latest @@ -0,0 +1 @@ +global_step240 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3a6ea45dd4e59b9683f66476f460fa0c77a9d66 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0c9979566a5d89cb3c766336548670ec6f2291deba1b7ab1764c12d3187b24 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42e6b0d6985c9b3f0cec701759e0b3d671c77abd --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e36a570d6158fc25d1cf5d9f8f450fc64c5a7683330277f89ff76d5f2fc6cd +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..376994a32199299a2a48b62753947cdb1f7ad72a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f619cbef4b74f1680d667c8788285a602392e63bdf3760ef3a59ec8864d483 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f1edb2dfec55e5cbead7ae3d14351c3650c4f77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc037fba93ace1bf7ce01b1a5f7d785698d47b4cc2cedf2300bbf7a41ebf05c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..016d34db4ec6597c207021d026234c9692c3f3ad --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab728c2461d6d1c64f04d7cbfdfcbfa7bd7ad0ef6e19d52458501ee81b27128 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d7824c2bd9e8b1cec7f0d84d673017b0da62e43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27530e653ebf5997ae3159cdcde264607e6a6f86b7e3c7a1b3a1e8301cd43d03 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f41ee261ad98d2d0eb8f09167a5b32604513b56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fddaeb1257697bd7c0101abf1ab23f2925d0d9165cd8bddfbd22f8444db2b7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8816834cc1c0e822e11a8df138fa41557f3a0fb --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942af3734a320fe12a3205a47ca1cdc7d1f0996bfde86c020a35545ccd2fd418 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce5faf9896aeadd65d47acddb4b510a6fc3c65f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a46b33bfe1e26ebea81904070b93f8e7376ae49add370042b1998521eed8ba +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4c47ef0286186091bb677e1182db4ea1728f2609 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/trainer_state.json @@ -0,0 +1,1119 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 4.8, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.014697201946726, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.6953125, + "logps/chosen": -133.0, + "logps/rejected": -412.0, + "loss": 0.35755233764648436, + "memory(GiB)": 57.7, + "nll_loss": 0.150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 13.5, + "rewards/rejected": -1.1640625, + "step": 165, + "train_speed(iter/s)": 0.114154 + }, + { + "epoch": 3.4, + "grad_norm": 0.35082819529806103, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.53125, + "logits/rejected": -1.8671875, + "logps/chosen": -498.0, + "logps/rejected": -141.0, + "loss": 0.3598182678222656, + "memory(GiB)": 57.7, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 18.875, + "rewards/rejected": 3.0625, + "step": 170, + "train_speed(iter/s)": 0.113875 + }, + { + "epoch": 3.5, + "grad_norm": 0.4565042851234479, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.546875, + "logps/chosen": -306.0, + 
"logps/rejected": -430.0, + "loss": 0.3215118408203125, + "memory(GiB)": 57.7, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 13.625, + "rewards/rejected": 2.046875, + "step": 175, + "train_speed(iter/s)": 0.114047 + }, + { + "epoch": 3.6, + "grad_norm": 0.2804732100108321, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9765625, + "logps/chosen": -426.0, + "logps/rejected": -232.0, + "loss": 0.40310821533203123, + "memory(GiB)": 57.7, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 15.9375, + "rewards/rejected": 1.734375, + "step": 180, + "train_speed(iter/s)": 0.114563 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4326171875, + "eval_nll_loss": 0.34765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 9.9375, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7191, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.2935390673249648, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.8984375, + "logps/chosen": -412.0, + "logps/rejected": -183.0, + "loss": 0.35445404052734375, + "memory(GiB)": 57.7, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 15.1875, + "rewards/rejected": 2.296875, + "step": 185, + "train_speed(iter/s)": 0.114484 + }, + { + "epoch": 3.8, + "grad_norm": 0.3456175313601738, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -2.0625, + "logits/rejected": -1.828125, + "logps/chosen": -246.0, + "logps/rejected": -552.0, + "loss": 0.35137252807617186, + "memory(GiB)": 57.7, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 15.875, + "rewards/margins": 15.1875, + "rewards/rejected": 0.67578125, + "step": 190, + "train_speed(iter/s)": 0.11489 + }, + { + "epoch": 3.9, + "grad_norm": 0.35528088328631074, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.625, + "logps/chosen": -348.0, + "logps/rejected": -496.0, + "loss": 0.29118738174438474, + "memory(GiB)": 57.7, + "nll_loss": 0.33984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 16.625, + "rewards/rejected": 1.609375, + "step": 195, + "train_speed(iter/s)": 0.114611 + }, + { + "epoch": 4.0, + "grad_norm": 0.5975495665409197, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.59375, + "logits/rejected": -1.5078125, + "logps/chosen": -270.0, + "logps/rejected": -508.0, + "loss": 0.3332973480224609, + "memory(GiB)": 69.46, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.25, + "rewards/rejected": 1.859375, + "step": 200, + "train_speed(iter/s)": 0.114365 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.9296875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -8.5, + "eval_logps/rejected": -229.0, + "eval_loss": 0.43798828125, + "eval_nll_loss": 0.369140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.6821, + "eval_samples_per_second": 1.086, + "eval_steps_per_second": 0.272, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3331703215396951, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.75, + "logits/rejected": -1.6875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.3664663314819336, + "memory(GiB)": 69.46, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 13.5, + "rewards/rejected": 2.046875, + "step": 205, + "train_speed(iter/s)": 0.113733 + 
}, + { + "epoch": 4.2, + "grad_norm": 0.3665381220661868, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -255.0, + "logps/rejected": -192.0, + "loss": 0.3193946838378906, + "memory(GiB)": 69.46, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.875, + "rewards/margins": 14.8125, + "rewards/rejected": 3.109375, + "step": 210, + "train_speed(iter/s)": 0.113959 + }, + { + "epoch": 4.3, + "grad_norm": 0.424301607874042, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5234375, + "logps/chosen": -149.0, + "logps/rejected": -892.0, + "loss": 0.3307832717895508, + "memory(GiB)": 69.46, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 13.0, + "rewards/rejected": 0.002349853515625, + "step": 215, + "train_speed(iter/s)": 0.114016 + }, + { + "epoch": 4.4, + "grad_norm": 0.22292040670451319, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.53125, + "logps/chosen": -81.5, + "logps/rejected": -780.0, + "loss": 0.30391464233398435, + "memory(GiB)": 69.46, + "nll_loss": 0.1484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 11.6875, + "rewards/rejected": 0.1201171875, + "step": 220, + "train_speed(iter/s)": 0.113876 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.0625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4443359375, + "eval_nll_loss": 0.39453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7528, + "eval_samples_per_second": 1.066, + "eval_steps_per_second": 0.266, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.30310486108061985, + "learning_rate": 2.7204760217631074e-06, + 
"logits/chosen": -1.75, + "logits/rejected": -1.8828125, + "logps/chosen": -524.0, + "logps/rejected": -197.0, + "loss": 0.34355936050415037, + "memory(GiB)": 69.46, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.875, + "rewards/margins": 20.75, + "rewards/rejected": 3.171875, + "step": 225, + "train_speed(iter/s)": 0.113594 + }, + { + "epoch": 4.6, + "grad_norm": 0.3741919235199971, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": -1.7421875, + "logits/rejected": -1.703125, + "logps/chosen": -252.0, + "logps/rejected": -676.0, + "loss": 0.3126819133758545, + "memory(GiB)": 69.46, + "nll_loss": 0.35546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.125, + "rewards/margins": 15.25, + "rewards/rejected": 1.8046875, + "step": 230, + "train_speed(iter/s)": 0.113544 + }, + { + "epoch": 4.7, + "grad_norm": 0.47750186161501185, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -1.875, + "logits/rejected": -1.640625, + "logps/chosen": -179.0, + "logps/rejected": -592.0, + "loss": 0.2852222442626953, + "memory(GiB)": 69.46, + "nll_loss": 0.2392578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.3125, + "rewards/margins": 14.125, + "rewards/rejected": -0.8515625, + "step": 235, + "train_speed(iter/s)": 0.113578 + }, + { + "epoch": 4.8, + "grad_norm": 0.28821358712762607, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -2.0, + "logits/rejected": -1.9453125, + "logps/chosen": -380.0, + "logps/rejected": -588.0, + "loss": 0.32334194183349607, + "memory(GiB)": 69.46, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 16.25, + "rewards/rejected": 2.828125, + "step": 240, + "train_speed(iter/s)": 0.113901 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.9140625, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.3125, + "eval_logps/rejected": -229.0, + "eval_loss": 0.44677734375, + "eval_nll_loss": 0.404296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.75, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.7327, + "eval_samples_per_second": 1.072, + "eval_steps_per_second": 0.268, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 692440788369408.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-240/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d93f43fd04e084ed4324e6c560ea850a8cd2ca2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d05036696a509f6eca240a2a3d355656def6cda6079ba85156171149d52f56a +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b189c6921db89a000038e9e5b40cb898260c410 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635f4262fccf8d1e36717109544b207886158489457e62c066221602480807fe +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1449c395b45e11de09b3b3e5dd9532ab858ef913 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b27e4310cb94936917abae002446191372a2aceaff33aebbedb67b5db740e34e +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30e543e48eb3aaf171f125fdfb50068bbec26ae7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb371ae1e033444902251629f1fe1068f89f34861b3813439d8dc09573d05af4 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7665383143b14f7613d209757814fb6e61861ab5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e657622b1c30369153f2bed3041c4429f11192fd85411d1e97f9648b26cdd4d4 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40fc97ab96b2fcdd60b3e3a0af22fb9e79e7c07c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31736d4c880a79cca14ce6f7e2d095e2f5a68fdc7b5f2240bdb1e06e9489bc5f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d1ca81bb95f1204e563f92e458db9f9d4c6ce9af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e3889c7bedd57a43ef3c557ac78d20e64ac065b5811490d30d76af1d7c3793 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c727e09a1be34f34b73bb9694118dbe88d92a486 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab3d54cefcc31f4b4f07b7936cc05d22016995bd90bf6f3057ae676cb4cdebd +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7181e8186deca540b88308299100023a95f76c79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e36c4b154e90ccb7cfe7746079f0c070c5a76f1ced7d7e417f089c1a804c8641 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a74c72595344e47d125eee40af57b1f2649b56f4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:402309bdc3ad5853e48f3ef675a9d40d3a415e4fd0142893c49a69df6f055314 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e82ec0902114f99a626f493c97be4bed74a33a2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d14e0cdecbd47f30ce23c03790cb72e556beeb8e8743e760ac502b0b5a88fc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..86dbbf0c0e1448a36ca86dabe1c03c3c73d80299 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab93cfa6f04c0f56ec75d10f12cc26a3afd6807507abf85bf7836417cf8e5eb5 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24840728efea46dd4f1b9d2890d1d6418c54d5c9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce4ffd58d54fef5848a0455eea663fd72b371741f5d7ce93df05be02e982ed00 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dd6c3a526c80c184698a2bf3a084d952376d7f0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6da7e85b61a09c3872fe5e2bb7776ff50db9724141b99238b261f5bd8b0b04e3 +size 886254 diff 
--git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73f670d36fdb6675b12817e9a25e3d3c4d9f3615 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a2714e02715e4e558224e47b6461252e3a4aafe9c42d9a4c5474424fb44b32b +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..461c06b430d8736579badfe588825138ee056ff4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15062182dd944e2770bbd3661a15f0ad423a617adac5c9400b175e56124b0dcf +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c9dbb8b98ce6607964fd48afa3b50d981fa031a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a953f07ad1f6d0cbe70ea9a3074c82f719ef703588f08ad405efd7301002fb6 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/latest new file mode 100644 index 0000000000000000000000000000000000000000..87449ff1a854ba4a77ea33fbc24adaed3311d6b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/latest @@ -0,0 +1 @@ +global_step250 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab29abc7c5c196288fd5c119c67c4f655f27d44c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5c4738c31c5c9a38e1f586256d59a0e8e7d02641b9b9af2afdbe078440aeb4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8e0ba47a098b34da66857368b41c80a5d9d796f --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d374b3390eb52ec7f6161c06272d4f26cb715692bdf2ad5374287b6de420ca3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7676e48e7dd332be5f46585fc5f824c5791f76ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24111edc5a6a2994166cd410155ee3c630816d0fe21c13808ebd2a2ae45bc9d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..228202ae722c05ed5fafc13eeac33a8a2685cca5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157b21eda1c7f898e519251deed08049767ffba123797289de56343a92ba7380 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a63de21fa3e29782ced5828f8f34fba46bad33 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb615552e5845759bc13aa2ae50c0525fbf941fa76ee2e2c20cb9838fe1995 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d487727115f1120e55e91ad9583fb23ff8e34083 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf720fc22147ce563d6f2c2f6f3d916a7e8b7af174b480d072b5c822e992aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90628d8fd79ee2a98fb904251b6d7938f5120b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d055d3b033dc8e6fc2a19aa95162960544ab94a903988874315efe4ed5aa8e13 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e1556a7ec04e7309f4c9130351c880ef6a0626 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e03c685f2e019350bfdd41f006495a18690aacbccd7ffc1f40de827f433eb87 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..871b4a6cbd60ea4b2ef2416f3a46bbe632ddb667 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b1af2ae92a304371e36f6c1b7001f5dafc395be0b17c480957fc7fb58d8cd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d09fa3fca5b7d1a56ad0b17bed9cebcf268fc9e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/trainer_state.json @@ -0,0 +1,1172 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + 
"logps/chosen": -482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + 
"rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + 
"train_speed(iter/s)": 0.115939 + }, + { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 
-1.8125, + "logits/rejected": -1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + 
"nll_loss": 0.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.3760194352348836, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.671875, + "logps/chosen": -237.0, + "logps/rejected": -620.0, + "loss": 0.5662841796875, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 10.75, + "rewards/rejected": 1.5234375, + "step": 85, + 
"train_speed(iter/s)": 0.115467 + }, + { + "epoch": 1.8, + "grad_norm": 0.31117947132159346, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.8046875, + "logps/chosen": -240.0, + "logps/rejected": -426.0, + "loss": 0.4222900390625, + "memory(GiB)": 57.7, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 12.125, + "rewards/rejected": 0.96875, + "step": 90, + "train_speed(iter/s)": 0.114775 + }, + { + "epoch": 1.9, + "grad_norm": 0.3346492838768647, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -1.796875, + "logits/rejected": -1.4921875, + "logps/chosen": -286.0, + "logps/rejected": -446.0, + "loss": 0.4668212890625, + "memory(GiB)": 57.7, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 12.625, + "rewards/rejected": 2.375, + "step": 95, + "train_speed(iter/s)": 0.114788 + }, + { + "epoch": 2.0, + "grad_norm": 0.2695636678577293, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -1.7578125, + "logits/rejected": -1.609375, + "logps/chosen": -222.0, + "logps/rejected": -288.0, + "loss": 0.4191619873046875, + "memory(GiB)": 57.7, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.0, + "rewards/margins": 10.8125, + "rewards/rejected": 1.15625, + "step": 100, + "train_speed(iter/s)": 0.114564 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.8515625, + "eval_logits/rejected": -1.5859375, + "eval_logps/chosen": -7.25, + "eval_logps/rejected": -223.0, + "eval_loss": 0.42919921875, + "eval_nll_loss": 0.314453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.8125, + "eval_rewards/margins": 9.3125, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7095, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.27, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.23160497773181857, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 
-1.8515625, + "logits/rejected": -1.796875, + "logps/chosen": -366.0, + "logps/rejected": -540.0, + "loss": 0.4347412109375, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 11.3125, + "rewards/rejected": 3.765625, + "step": 105, + "train_speed(iter/s)": 0.113576 + }, + { + "epoch": 2.2, + "grad_norm": 0.32495458997078863, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -1.5390625, + "logits/rejected": -1.453125, + "logps/chosen": -332.0, + "logps/rejected": -824.0, + "loss": 0.42264404296875, + "memory(GiB)": 57.7, + "nll_loss": 0.37890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.625, + "rewards/margins": 10.4375, + "rewards/rejected": 3.171875, + "step": 110, + "train_speed(iter/s)": 0.113607 + }, + { + "epoch": 2.3, + "grad_norm": 0.1698986411617881, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -1.7734375, + "logits/rejected": -1.6328125, + "logps/chosen": -414.0, + "logps/rejected": -576.0, + "loss": 0.447998046875, + "memory(GiB)": 57.7, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 13.0, + "rewards/rejected": 2.96875, + "step": 115, + "train_speed(iter/s)": 0.113793 + }, + { + "epoch": 2.4, + "grad_norm": 0.2858000402116111, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -1.828125, + "logits/rejected": -1.765625, + "logps/chosen": -312.0, + "logps/rejected": -286.0, + "loss": 0.369580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 11.625, + "rewards/rejected": 1.53125, + "step": 120, + "train_speed(iter/s)": 0.114561 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -224.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.296875, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.7647, + "eval_samples_per_second": 1.062, + "eval_steps_per_second": 0.266, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.30453527889308935, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -2.140625, + "logits/rejected": -1.703125, + "logps/chosen": -43.25, + "logps/rejected": -920.0, + "loss": 0.4149017333984375, + "memory(GiB)": 57.7, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 10.9375, + "rewards/rejected": -0.59765625, + "step": 125, + "train_speed(iter/s)": 0.114475 + }, + { + "epoch": 2.6, + "grad_norm": 0.4249608562032297, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8671875, + "logps/chosen": -330.0, + "logps/rejected": -238.0, + "loss": 0.4016357421875, + "memory(GiB)": 57.7, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 14.75, + "rewards/rejected": 1.8984375, + "step": 130, + "train_speed(iter/s)": 0.115111 + }, + { + "epoch": 2.7, + "grad_norm": 0.4079479376826424, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.7734375, + "logps/chosen": -127.5, + "logps/rejected": -516.0, + "loss": 0.37172470092773435, + "memory(GiB)": 57.7, + "nll_loss": 0.216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0, + "rewards/margins": 10.6875, + "rewards/rejected": 0.33984375, + "step": 135, + "train_speed(iter/s)": 0.115369 + }, + { + "epoch": 2.8, + "grad_norm": 0.2037186119283533, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -1.96875, + "logits/rejected": -1.8359375, + "logps/chosen": -290.0, + "logps/rejected": -580.0, + "loss": 0.39625892639160154, + "memory(GiB)": 57.7, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 13.25, + 
"rewards/rejected": 0.91796875, + "step": 140, + "train_speed(iter/s)": 0.115894 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.8984375, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -224.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": -0.6015625, + "eval_runtime": 3.6938, + "eval_samples_per_second": 1.083, + "eval_steps_per_second": 0.271, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.663146454101917, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -1.8125, + "logits/rejected": -1.84375, + "logps/chosen": -482.0, + "logps/rejected": -532.0, + "loss": 0.438983154296875, + "memory(GiB)": 57.7, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.8125, + "rewards/margins": 11.8125, + "rewards/rejected": 3.015625, + "step": 145, + "train_speed(iter/s)": 0.114627 + }, + { + "epoch": 3.0, + "grad_norm": 0.35248744382044206, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -1.5546875, + "logits/rejected": -1.8671875, + "logps/chosen": -348.0, + "logps/rejected": -235.0, + "loss": 0.337396240234375, + "memory(GiB)": 57.7, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 16.0, + "rewards/rejected": 3.125, + "step": 150, + "train_speed(iter/s)": 0.114227 + }, + { + "epoch": 3.1, + "grad_norm": 0.22136084702555914, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -1.7265625, + "logits/rejected": -1.78125, + "logps/chosen": -444.0, + "logps/rejected": -490.0, + "loss": 0.41593017578125, + "memory(GiB)": 57.7, + "nll_loss": 0.40625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.875, + "rewards/margins": 14.625, + "rewards/rejected": 2.21875, + "step": 155, + "train_speed(iter/s)": 0.114508 + }, + { + "epoch": 3.2, + "grad_norm": 0.27404829822114873, + 
"learning_rate": 3.1556803773799614e-05, + "logits/chosen": -1.859375, + "logits/rejected": -2.046875, + "logps/chosen": -380.0, + "logps/rejected": -306.0, + "loss": 0.378253173828125, + "memory(GiB)": 57.7, + "nll_loss": 0.365234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 14.8125, + "rewards/rejected": 2.9375, + "step": 160, + "train_speed(iter/s)": 0.114379 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -6.9375, + "eval_logps/rejected": -230.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.30078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 10.0625, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7366, + "eval_samples_per_second": 1.07, + "eval_steps_per_second": 0.268, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 1.014697201946726, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.6953125, + "logps/chosen": -133.0, + "logps/rejected": -412.0, + "loss": 0.35755233764648436, + "memory(GiB)": 57.7, + "nll_loss": 0.150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 13.5, + "rewards/rejected": -1.1640625, + "step": 165, + "train_speed(iter/s)": 0.114154 + }, + { + "epoch": 3.4, + "grad_norm": 0.35082819529806103, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": -1.53125, + "logits/rejected": -1.8671875, + "logps/chosen": -498.0, + "logps/rejected": -141.0, + "loss": 0.3598182678222656, + "memory(GiB)": 57.7, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 18.875, + "rewards/rejected": 3.0625, + "step": 170, + "train_speed(iter/s)": 0.113875 + }, + { + "epoch": 3.5, + "grad_norm": 0.4565042851234479, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.546875, + "logps/chosen": -306.0, + 
"logps/rejected": -430.0, + "loss": 0.3215118408203125, + "memory(GiB)": 57.7, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.625, + "rewards/margins": 13.625, + "rewards/rejected": 2.046875, + "step": 175, + "train_speed(iter/s)": 0.114047 + }, + { + "epoch": 3.6, + "grad_norm": 0.2804732100108321, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -1.7890625, + "logits/rejected": -1.9765625, + "logps/chosen": -426.0, + "logps/rejected": -232.0, + "loss": 0.40310821533203123, + "memory(GiB)": 57.7, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 15.9375, + "rewards/rejected": 1.734375, + "step": 180, + "train_speed(iter/s)": 0.114563 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.0, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4326171875, + "eval_nll_loss": 0.34765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.75, + "eval_rewards/margins": 9.9375, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7191, + "eval_samples_per_second": 1.076, + "eval_steps_per_second": 0.269, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.2935390673249648, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -1.671875, + "logits/rejected": -1.8984375, + "logps/chosen": -412.0, + "logps/rejected": -183.0, + "loss": 0.35445404052734375, + "memory(GiB)": 57.7, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 15.1875, + "rewards/rejected": 2.296875, + "step": 185, + "train_speed(iter/s)": 0.114484 + }, + { + "epoch": 3.8, + "grad_norm": 0.3456175313601738, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -2.0625, + "logits/rejected": -1.828125, + "logps/chosen": -246.0, + "logps/rejected": -552.0, + "loss": 0.35137252807617186, + "memory(GiB)": 57.7, + "nll_loss": 0.53515625, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 15.875, + "rewards/margins": 15.1875, + "rewards/rejected": 0.67578125, + "step": 190, + "train_speed(iter/s)": 0.11489 + }, + { + "epoch": 3.9, + "grad_norm": 0.35528088328631074, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.625, + "logps/chosen": -348.0, + "logps/rejected": -496.0, + "loss": 0.29118738174438474, + "memory(GiB)": 57.7, + "nll_loss": 0.33984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 16.625, + "rewards/rejected": 1.609375, + "step": 195, + "train_speed(iter/s)": 0.114611 + }, + { + "epoch": 4.0, + "grad_norm": 0.5975495665409197, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -1.59375, + "logits/rejected": -1.5078125, + "logps/chosen": -270.0, + "logps/rejected": -508.0, + "loss": 0.3332973480224609, + "memory(GiB)": 69.46, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.25, + "rewards/rejected": 1.859375, + "step": 200, + "train_speed(iter/s)": 0.114365 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.9296875, + "eval_logits/rejected": -1.65625, + "eval_logps/chosen": -8.5, + "eval_logps/rejected": -229.0, + "eval_loss": 0.43798828125, + "eval_nll_loss": 0.369140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.6821, + "eval_samples_per_second": 1.086, + "eval_steps_per_second": 0.272, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.3331703215396951, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -1.75, + "logits/rejected": -1.6875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.3664663314819336, + "memory(GiB)": 69.46, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 13.5, + "rewards/rejected": 2.046875, + "step": 205, + "train_speed(iter/s)": 0.113733 + 
}, + { + "epoch": 4.2, + "grad_norm": 0.3665381220661868, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -1.7890625, + "logits/rejected": -1.7734375, + "logps/chosen": -255.0, + "logps/rejected": -192.0, + "loss": 0.3193946838378906, + "memory(GiB)": 69.46, + "nll_loss": 0.31640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.875, + "rewards/margins": 14.8125, + "rewards/rejected": 3.109375, + "step": 210, + "train_speed(iter/s)": 0.113959 + }, + { + "epoch": 4.3, + "grad_norm": 0.424301607874042, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.8984375, + "logits/rejected": -1.5234375, + "logps/chosen": -149.0, + "logps/rejected": -892.0, + "loss": 0.3307832717895508, + "memory(GiB)": 69.46, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 13.0, + "rewards/rejected": 0.002349853515625, + "step": 215, + "train_speed(iter/s)": 0.114016 + }, + { + "epoch": 4.4, + "grad_norm": 0.22292040670451319, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.796875, + "logits/rejected": -1.53125, + "logps/chosen": -81.5, + "logps/rejected": -780.0, + "loss": 0.30391464233398435, + "memory(GiB)": 69.46, + "nll_loss": 0.1484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.8125, + "rewards/margins": 11.6875, + "rewards/rejected": 0.1201171875, + "step": 220, + "train_speed(iter/s)": 0.113876 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.0625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4443359375, + "eval_nll_loss": 0.39453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.7528, + "eval_samples_per_second": 1.066, + "eval_steps_per_second": 0.266, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.30310486108061985, + "learning_rate": 2.7204760217631074e-06, + 
"logits/chosen": -1.75, + "logits/rejected": -1.8828125, + "logps/chosen": -524.0, + "logps/rejected": -197.0, + "loss": 0.34355936050415037, + "memory(GiB)": 69.46, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.875, + "rewards/margins": 20.75, + "rewards/rejected": 3.171875, + "step": 225, + "train_speed(iter/s)": 0.113594 + }, + { + "epoch": 4.6, + "grad_norm": 0.3741919235199971, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": -1.7421875, + "logits/rejected": -1.703125, + "logps/chosen": -252.0, + "logps/rejected": -676.0, + "loss": 0.3126819133758545, + "memory(GiB)": 69.46, + "nll_loss": 0.35546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.125, + "rewards/margins": 15.25, + "rewards/rejected": 1.8046875, + "step": 230, + "train_speed(iter/s)": 0.113544 + }, + { + "epoch": 4.7, + "grad_norm": 0.47750186161501185, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -1.875, + "logits/rejected": -1.640625, + "logps/chosen": -179.0, + "logps/rejected": -592.0, + "loss": 0.2852222442626953, + "memory(GiB)": 69.46, + "nll_loss": 0.2392578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.3125, + "rewards/margins": 14.125, + "rewards/rejected": -0.8515625, + "step": 235, + "train_speed(iter/s)": 0.113578 + }, + { + "epoch": 4.8, + "grad_norm": 0.28821358712762607, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -2.0, + "logits/rejected": -1.9453125, + "logps/chosen": -380.0, + "logps/rejected": -588.0, + "loss": 0.32334194183349607, + "memory(GiB)": 69.46, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 16.25, + "rewards/rejected": 2.828125, + "step": 240, + "train_speed(iter/s)": 0.113901 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.9140625, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.3125, + "eval_logps/rejected": -229.0, + "eval_loss": 0.44677734375, + "eval_nll_loss": 0.404296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.75, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 3.7327, + "eval_samples_per_second": 1.072, + "eval_steps_per_second": 0.268, + "step": 240 + }, + { + "epoch": 4.9, + "grad_norm": 0.3432327045772877, + "learning_rate": 1.0978021666005478e-07, + "logits/chosen": -1.6953125, + "logits/rejected": -1.8984375, + "logps/chosen": -456.0, + "logps/rejected": -476.0, + "loss": 0.3428853988647461, + "memory(GiB)": 69.46, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 15.6875, + "rewards/rejected": 1.921875, + "step": 245, + "train_speed(iter/s)": 0.113824 + }, + { + "epoch": 5.0, + "grad_norm": 1.6712453590352487, + "learning_rate": 0.0, + "logits/chosen": -1.7890625, + "logits/rejected": -1.90625, + "logps/chosen": -332.0, + "logps/rejected": -386.0, + "loss": 0.3745833396911621, + "memory(GiB)": 69.46, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.5625, + "rewards/rejected": 1.59375, + "step": 250, + "train_speed(iter/s)": 0.114113 + }, + { + "epoch": 5.0, + "eval_logits/chosen": -1.9140625, + "eval_logits/rejected": -1.6640625, + "eval_logps/chosen": -9.25, + "eval_logps/rejected": -230.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -1.203125, + "eval_runtime": 3.6978, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.27, + "step": 250 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 
720119600250880.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3df3f944e370a0c7edb8ebd624a4183cbc518680 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f714f9923217090ffdf76d9f7b262aa72ccbd4fdf656007ae531ea8891c5ae4 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc870c84bdfea75f928520de51ba497bfc0d14e4 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8613767821698e688481078e3ec99dc1bad31edd7948cd65b83acebffb251a6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62cb53c277ed2df8c481b3b130951c3e35fe9dc7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c33a2e9584ce1faafd50d459cbbe76a8b7c440f5fc4a73b3ad494e116733cc +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf8bc9fa0c4ea312f46f11fc3ee5e4fe055c4c41 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ed1b8372c708ea6b291549a40540ecc61f036e5f72cf8cfd53ac36ae62b097 +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb76a18ea7f4de676b582df2d13c7a0d728db01d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8084b70e765471cf26de257cf1f202fe37b0dce4838e931d511087f7cc18be3 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd7be668bf273646ae11b1522864e22b901824e1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91469293aa0d22d8ca93e8ca89ec5ba59a31a35e0ee11e76e290a4a19a6ab674 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3d86709caf706b946176bd66c313887ebc394f89 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28dfee7e03358e207c5fb3a60ae3a94bb71e6d688559d30ce78b115919078b02 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7017b5ef352d43f5faf6f044fdd0e165ed091c7c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adee555933d26a0e4785d65ee514f73307bf2031c4eb0aa987bbcc371811d624 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5d60fe1ea15f15d6579971fa84eee245c21a276 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb1ac0c5c6676ced436d6c7d873f48d91fa592de005cfc81b71466613d1c763 +size 
100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afd26448610161980f83389f727fb1c7e12384c9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e085ec99f66d7d0af52d756897f1985a3cd2dd897bef8d94ef9897cca041182d +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41d4772e898805acd074c3ccf646bbd353bf8120 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26eedb4add806d5e06a1c23c0a9210eeefa44563ed1eb1abe03cd2950196625e +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ebfe31c48c46a1583b87a294bc772457442a932e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b8eb769532bfd9131fb7c05d8ab9e4e4dd0216c54f5c1700e3f1cdbb20f30f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea7c658d78f301f142e64b5da169ec48621fbac1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f601b6762c81f1718e65faa2789c8fb0eaa4b4cc66732ba21e98b1f2a8acddf +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f76b140f623a51abc2b94d754e79008928a8cc3c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:204a80b0d81f381d03d0f205c902e635cfa7c9bd2e35f27a02856973e596e92f +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a894f5d5873fd6f34862665f66ed3ddb4071fc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:436daa2d4ffd49559a612ad1ef775beae0fde39b211e73e44dde51e6bffce9cd +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e870bbf579217a0ecf645fc18820fb853a2fb3e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0294c4de7634613150fe1f2dc93cce28ac48833a1cebcc72e7b9cbfd375ddb55 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2bc85cc1cdca8ed7b1330c6ba5d0d1a34494e4a --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00637484f9a03fafb12f0adecf1553eca331026de857a830400189f8d934bb5a +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..35252259eb09d8de259231f63f19e786e44bc7b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb7d8df6ed170dd98dba8737bc9dd038af61afd --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e144a445ffd57fbb5be9b5131f17149bde6c4ff5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10f35268ac2a0cb68abc0b78ba5b150b0f29d78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6ef21562e384e0889ec2400e8f84b6b0bc59035 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..126662e5f97dd3f9cd7fb87e70843d3d5532dde3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4e6b27cc99b0fa8e6bbf967892f9304b444d81d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..14e214a61e5311f2b7edf2200ec0365ed2dcc5e1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f899c511136dcc56c12c5058062d17686812d1b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab67fe2c6ce9ef0b836ccb0828d5bb1b593055af205b40af49c85139e46a6c8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e75cddd4e2533b7de6c76dadca930308e61c3076 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.46484375, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40", + "epoch": 0.8, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": 
-482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + "rewards/accuracies": 
0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + "train_speed(iter/s)": 0.115939 + }, 
+ { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 115590834946048.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..193f5e5bab02cc58d3148e48f99110af6cac8cd6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eeb7c145f1d78e80d0f275b6a978c299297fe9805485f56efc59399a367c69a +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0b1577c90ee72405a3bc4be770de6dd66963ddf --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bda55abf8821c70275d3e2e94f48f71e2f519d582f9324fb68e38b50b93d8b9 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16737a0693fe5970ef3d12949b01440419be34ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d37dc4c664b3c2a22d526fc04c235f324744dddb9180b12088f5e1ab72caa64b +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ea40d2194ce13cc37c4828c10aa7f3c48dbac29 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cba06671474b0555798ef767031336f702770b8d67ee056c76fdca8950582bb +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e50161ca249617217f6b956615a2662576c3434 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc38c6abebc56065c94843cb16ea4ed08eed7a2ffb54e4ac7f01e3fff4ef815 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e38ec1fcbc674f25215c7f91bb4f1556e3599f3b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fd27add1279c14d5e65131825268059dc7ef61e31d8144608e89b5f89f6e7c +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ca9f2aa82bf03c5b29d0eb89837e217ba9fb276b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27565be88065631584dc792d136f7be14bc893984168f3740e7f8de97ed34bf2 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7266e2a1c24e43da6d9dea4362148605a82d2e3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d68a63217f19e3479599967dc9bd94705bb4653f200a5df2ace8daec643c2a6 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1db8b5c97edffcc11f32dd6c251f90afdf249c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6dfd22d331fd49254f2b5627b6cc3287dc215984ccfa33a83f9b725500ebee +size 
100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71ccb80f7801d91fcb938a0dc6bff55147b56933 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567c1ace8b1e93fd6c783a37c5dfdf7536609142cc71c09bbc2a7b61dc557c08 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08733c1c803590d0a99c7585fd92b2451491e7a8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7fa2fe1e7240deab9eab4e4304f56998d4cb61ebbdaa76b6f67a6ab50beeaed +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..95daecebcad586f88bf0821b42cf9f02594da6d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731cb7d6895c1f4c1de4bcbf529e851e944579284bff323e574d3a654304ec14 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41a7ecc9ef32ab9e91ea4a2ef288364e03a07b68 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c458cd3a48a1f4248291a59e85a1b738bc6bd65d0984b80f024cb2b04e6b614c +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ee789e691363e4beb416e61a755de8c4a8cb4bb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a8a296f63be3425e0d5b8ae125bc53d146791900e81fe0609d4554bf1ac383 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4a267c1699804ac9fb7c48b25cfc4c95988eab4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a7cff424966f84a2d175976e7a0d500c01562ffc722a6b16d233221d217bf7 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bf10104c1ada1ac62c91540c9dc0831239215af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bfc85ed774ad636887efc295f76c0e0566386edd44a158b1bbdf16532e3751 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c069015509ed9f38038788190639130a16dff981 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b678f065ab8621c0af00fd2ed6452fea96665bde9ba4d89ddf8ebf5cd0da5983 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..728c3241a49cbd920d5df86255fc8be4d97c5519 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa3ba485fff4300fd9029c17ba92c92630af852a00df0a0e8d16c233f74cbc8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae45e2f51f3e0ed4f11b36edcf83fb916bcb5c7a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.44628906, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60", + "epoch": 1.2, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": 
-482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + "rewards/accuracies": 
0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + "train_speed(iter/s)": 0.115939 + }, 
+ { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.8125, + "logits/rejected": 
-1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 171964565553152.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2bcaf74ef04989c74b12702985590e9ac3b842 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ca7d7891b14eb2db3c4adb12136695f52062f87 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32257f4d81bc700ede89c7b27626b10145c17a78e8f154e498c60845b583097 +size 134337704 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..70e8e2a9b484918588a26cefd46a741cca04023f --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": 
"DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..370ea214216c28b0441566744c5ed987e0a34f4d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f2adcb23c852e4064b75f6eb60dde3a5c1cdf1075a39a85885ad7bb4f79577 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce6796bc0dac6f940961c605e89d409f9e4c2804 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92151b83cdae771cc984161fc7411ecddec0524e05d89d5ee047d3ae574005bd +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6592b54d4a9c260429dd2c8e1a7101d52df24fb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bab809ffb44f38d895dc1615998c92cc46a5f4481be032dd37d4c39b7adfae +size 100667312 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..895e8b2da58a35ac199d9eb5e79b573cc2314a1d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52dba86d96ad9e7327b324abedf6ad4745668df51f71d1a6c3cbc4112315bae7 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..379e327a947e64aebdf627e9eda794fa61e5c2a5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f1a18202b649d62748321597d27df4b3efab7abfc2c6225c86551fe4bcf20 +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b2297df3dc74bc702a5655b952a2723c528c2f64 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c491885d3895934ee59d4f357a39f81b45d44684a422fbcf22c2d139ac711f5f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3719a9952c4ace98fe598e3a4db0e8d9e189a5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8edf5dc032b1ccc25bb0d8f74ee5e430df1bbb481cdcc24120e43d1f2e334f +size 100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab94b436dd3f2791b0e973b5bc7048cefc3df831 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d9b6b21a2cd4a0f7b599ed57a8b976da5f13f5b14f4f3b11f511c29c2f987a +size 
100667312 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88e179a3c6cc94d5d7ab1baee5012bc040caa2a9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5433fa7cb36d9c6a0960218414f18cad807626a60cb0b9c4af04d834611e82b3 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9148bcfc4108d3b6ac68b0e24254e458e5acda1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b06cb84466a3d3e275e9dc70fb996bf4148b20ab3063417b2aae735c960c893 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..83a84a8772126e5af82470c89ade4dab170aa3d7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:009b0728223cb2e931eb99522cb7654987ce18e10def82e2517060c79ada8e1f +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b420efb31027b3892404f3342e715a1b6d54a2d2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3ce249cd4c096e0cd3653f1f5c5667d11385cb513e0934aa1e6c51ede0fea9 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..882ac38b99a02b7e8548bcb82d0ea9b0c703f2b7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90473a56d60e1f1246de0f3705b97518cd82d450c7f589f25ebc253955805fd2 +size 886254 diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ee03c817482049be4f004f5987c11c85ef5e571 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c83ee6333ed260743efb6d87eb4dba591eb0915f2e8992bd9fd9bf8926981cc +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36bb2642b6e0f1792b785ddcf8e371eb5b6454d3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007241c4be968ef9b7e0252ee3646fc877d8755cca0334f1b72792ef4a9eea2c +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d3bc11afbd50cfb893487c891b1f4f463df63d2 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c63ce470a652b53ad06820e8e391ede60921dc0942fe3c286f240a5584fcb4 +size 886254 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd2a62da4ca83b3b986d96dbf0eaeb82207ca93 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0628a9017696045a3a29e9eaffc71e9262d855716e773c0c3be760a1fe85bc8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ba5f3aba4388a582cd47f7f9e57cd5879b1cbd2 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df342004a4d8e3626bf2a9f689fde7c8bfd6d995e14931f5496eda1f456cb6f2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27b0f7845c2b9530c3e6ed3ce232ff4e86b86122 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02096eb4e8850b91490e80e4a042e2e60f71bd2abc6a269d62c271649cb77d2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfb583fc43c6dd4395671708744cfd18c419970 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c778d3d0e7e3d5665fa0a9ecd92986609c430da08b41611d6c05dc19815a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a8c64b1f15ac655b2be2a42fe61cabe2a877704 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d978dcb0c34e022ee6750e9d86814b8c82e4965d7e07662f35f06eeac12938f3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..262e8187e6caeca12ef3b0aa923b12afd697e03d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e83399aed1d9d173c3e07b2efa8530c956b62b2b68394c2ed0d43bd8bba9d1 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..72f794e31f8d3e0c63972e5076e1ed90c52087ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606ab3ca92e3d20c327c69fdcce7f7e39bec2f2c3538b036088b255f917e3ba4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..244e7fdaa1cef2e82bd4e16afb10f32f68318bcc --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276a987dd22c9093fec58921ba19f340a28f18bff635cc01324e09a3c37ac3a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e36a588df493151f57c8f73aa08129a3810c2c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee30cdff92a069fa950619177f737b278c096bc7c83c0e5bdea15a673218022 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a66ea1c87ed3d9afa05f65789c42230960462edf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.43115234, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80", + "epoch": 1.6, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 4.512870866230471, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -1.6328125, + "logits/rejected": -1.7265625, + "logps/chosen": 
-482.0, + "logps/rejected": -127.0, + "loss": 2.2119140625, + "memory(GiB)": 16.32, + "nll_loss": 0.4765625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.069128 + }, + { + "epoch": 0.1, + "grad_norm": 4.490948635598285, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": -1.1953125, + "logits/rejected": -1.3671875, + "logps/chosen": -552.0, + "logps/rejected": -208.0, + "loss": 2.10546875, + "memory(GiB)": 16.32, + "nll_loss": 0.875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.114906 + }, + { + "epoch": 0.2, + "grad_norm": 3.956939059703609, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -1.75, + "logits/rejected": -1.78125, + "logps/chosen": -496.0, + "logps/rejected": -476.0, + "loss": 1.8939453125, + "memory(GiB)": 22.08, + "nll_loss": 1.21875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": -0.1298828125, + "rewards/rejected": 0.1396484375, + "step": 10, + "train_speed(iter/s)": 0.113412 + }, + { + "epoch": 0.3, + "grad_norm": 3.6228821312017345, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -1.515625, + "logits/rejected": -1.3515625, + "logps/chosen": -426.0, + "logps/rejected": -900.0, + "loss": 1.661376953125, + "memory(GiB)": 40.0, + "nll_loss": 1.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203125, + "rewards/margins": 1.8359375, + "rewards/rejected": 0.361328125, + "step": 15, + "train_speed(iter/s)": 0.108256 + }, + { + "epoch": 0.4, + "grad_norm": 2.027994249854522, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -1.8828125, + "logits/rejected": -1.65625, + "logps/chosen": -316.0, + "logps/rejected": -262.0, + "loss": 1.4244140625, + "memory(GiB)": 40.0, + "nll_loss": 0.494140625, + "rewards/accuracies": 
0.6000000238418579, + "rewards/chosen": 5.0, + "rewards/margins": 0.05615234375, + "rewards/rejected": 4.96875, + "step": 20, + "train_speed(iter/s)": 0.110602 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.4921875, + "eval_logits/rejected": -1.6328125, + "eval_logps/chosen": -20.125, + "eval_logps/rejected": -151.0, + "eval_loss": 0.7607421875, + "eval_nll_loss": 0.87890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.5625, + "eval_rewards/margins": 0.875, + "eval_rewards/rejected": 6.6875, + "eval_runtime": 3.7645, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.266, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 1.8463408544237272, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -1.5703125, + "logits/rejected": -1.390625, + "logps/chosen": -264.0, + "logps/rejected": -312.0, + "loss": 0.88779296875, + "memory(GiB)": 40.0, + "nll_loss": 0.42578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.375, + "rewards/margins": 2.328125, + "rewards/rejected": 6.0625, + "step": 25, + "train_speed(iter/s)": 0.109574 + }, + { + "epoch": 0.6, + "grad_norm": 1.680535134943694, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -1.9765625, + "logits/rejected": -1.859375, + "logps/chosen": -318.0, + "logps/rejected": -540.0, + "loss": 0.6205078125, + "memory(GiB)": 40.0, + "nll_loss": 0.6015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.5, + "rewards/margins": 3.546875, + "rewards/rejected": 4.9375, + "step": 30, + "train_speed(iter/s)": 0.112928 + }, + { + "epoch": 0.7, + "grad_norm": 4.744718397124442, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -1.6875, + "logits/rejected": -1.734375, + "logps/chosen": -450.0, + "logps/rejected": -221.0, + "loss": 0.5630126953125, + "memory(GiB)": 40.0, + "nll_loss": 0.72265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.4375, + "rewards/margins": 8.25, + "rewards/rejected": 2.15625, + "step": 35, + "train_speed(iter/s)": 0.115939 + }, 
+ { + "epoch": 0.8, + "grad_norm": 0.48349890650801186, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.7265625, + "logps/chosen": -174.0, + "logps/rejected": -318.0, + "loss": 0.46617431640625, + "memory(GiB)": 40.0, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.625, + "rewards/margins": 7.8125, + "rewards/rejected": 1.8125, + "step": 40, + "train_speed(iter/s)": 0.114308 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -1.765625, + "eval_logits/rejected": -1.6484375, + "eval_logps/chosen": -8.9375, + "eval_logps/rejected": -204.0, + "eval_loss": 0.46484375, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.625, + "eval_rewards/margins": 7.21875, + "eval_rewards/rejected": 1.3984375, + "eval_runtime": 3.7281, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.268, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.48512233095475726, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -1.9375, + "logits/rejected": -1.8828125, + "logps/chosen": -388.0, + "logps/rejected": -406.0, + "loss": 0.455224609375, + "memory(GiB)": 45.68, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 12.4375, + "rewards/rejected": 3.171875, + "step": 45, + "train_speed(iter/s)": 0.112458 + }, + { + "epoch": 1.0, + "grad_norm": 0.7305067655375815, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -1.7421875, + "logits/rejected": -1.46875, + "logps/chosen": -386.0, + "logps/rejected": -924.0, + "loss": 0.43553466796875, + "memory(GiB)": 47.13, + "nll_loss": 0.384765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.1875, + "rewards/margins": 7.15625, + "rewards/rejected": 4.0, + "step": 50, + "train_speed(iter/s)": 0.113671 + }, + { + "epoch": 1.1, + "grad_norm": 0.2027321813402121, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": -1.8125, + "logits/rejected": 
-1.828125, + "logps/chosen": -322.0, + "logps/rejected": -584.0, + "loss": 0.4790283203125, + "memory(GiB)": 57.7, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.875, + "rewards/margins": 10.6875, + "rewards/rejected": 3.1875, + "step": 55, + "train_speed(iter/s)": 0.113945 + }, + { + "epoch": 1.2, + "grad_norm": 0.17630688555010188, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -1.9140625, + "logits/rejected": -1.6796875, + "logps/chosen": -180.0, + "logps/rejected": -480.0, + "loss": 0.430853271484375, + "memory(GiB)": 57.7, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.9375, + "rewards/margins": 11.3125, + "rewards/rejected": 1.640625, + "step": 60, + "train_speed(iter/s)": 0.114879 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": -1.6171875, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.6875, + "eval_rewards/margins": 9.1875, + "eval_rewards/rejected": -0.5, + "eval_runtime": 3.7235, + "eval_samples_per_second": 1.074, + "eval_steps_per_second": 0.269, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.3355581056539049, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -1.734375, + "logits/rejected": -1.6015625, + "logps/chosen": -258.0, + "logps/rejected": -660.0, + "loss": 0.47626953125, + "memory(GiB)": 57.7, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.5625, + "rewards/margins": 10.1875, + "rewards/rejected": 1.375, + "step": 65, + "train_speed(iter/s)": 0.113843 + }, + { + "epoch": 1.4, + "grad_norm": 0.3090578542362743, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -2.046875, + "logits/rejected": -1.578125, + "logps/chosen": -53.0, + "logps/rejected": -864.0, + "loss": 0.421142578125, + "memory(GiB)": 57.7, + "nll_loss": 0.53125, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 9.375, + "rewards/margins": 8.3125, + "rewards/rejected": 1.078125, + "step": 70, + "train_speed(iter/s)": 0.115066 + }, + { + "epoch": 1.5, + "grad_norm": 0.2679792251697414, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -1.875, + "logits/rejected": -1.6875, + "logps/chosen": -374.0, + "logps/rejected": -356.0, + "loss": 0.41944580078125, + "memory(GiB)": 57.7, + "nll_loss": 0.380859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.9375, + "rewards/margins": 12.3125, + "rewards/rejected": 1.6328125, + "step": 75, + "train_speed(iter/s)": 0.115391 + }, + { + "epoch": 1.6, + "grad_norm": 0.1784202423840373, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": -1.6640625, + "logits/rejected": -1.90625, + "logps/chosen": -454.0, + "logps/rejected": -109.0, + "loss": 0.45340576171875, + "memory(GiB)": 57.7, + "nll_loss": 0.54296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0, + "rewards/margins": 12.625, + "rewards/rejected": 3.421875, + "step": 80, + "train_speed(iter/s)": 0.116557 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.8125, + "eval_logits/rejected": -1.578125, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -228.0, + "eval_loss": 0.43115234375, + "eval_nll_loss": 0.310546875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.875, + "eval_rewards/margins": 9.875, + "eval_rewards/rejected": -1.0, + "eval_runtime": 3.6954, + "eval_samples_per_second": 1.082, + "eval_steps_per_second": 0.271, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 228284508405760.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": 
null +} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..26c4e30e7018885534b71aa0deb376adf707fcf9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8b8b6a164238aed2ffe9591fef78cb3224b1fb267b021ec71b72b1f8483e1b +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2fd8747b9aa54a4a5e7211baefedfb120b2d2d01 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..2b974af23248586f7a8aa3fba6656862ff7eaacf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2e759963d8e4574fa5e756e26952d29588eafcbf Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1f3c6d6dc7a27e7af626f03cb861c3dc038ef314 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..d7f445d16af9c0fb30f4039de33e9aced54e3133 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..bcb66b4fb16f15b2137dc759bc48a7ca212867d1 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..bcd3f332a0a9276590b2d2aad6821619831dd64b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..8cbbbd97012869422af3f6d7a96a7ef14dd28dcf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..7a69e4ed2217e6951a1a1ab4c3705d2b5fcfc76f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..81a94ddea356cad368a23873e564528bcfde1412 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..84c3f7908112f2d0c7d7d7254cdc3cc058a574ff Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..5e0799501e088b8e74c70b8ac6c5f7c0e9500cdf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..383e0c2d135cb05c9388916bd8065fa42a0f86e2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..97933611536b8acda691b8294659ee4282ded3e6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..6fb1cd5ab94c7e75c6ebd5e951e67df8c787980b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..f869fa37203441700940366c27c7d20493a62bae Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..c0f537f263df87b1432f829a60bd05841f98689c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1a0a2a19590e32335b0d0684db7949fb18ff9316 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..04ce33a20468a926526b42f72df85f4da0648a9e Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..04f38a675390aad746f650257fa1b0d6646fcbab Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..7b29d25477581050bb5761176a9a5d17d4a8faf2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..0af0b5594d776a0a640f284ce7688baa25480fac Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c7f5eae3ad3be5c0d0cd0e4eea3cc8d72a4e4c78 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..2de357c19581604df231f97fccbd4dcc1510ec82 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2a4228539437014b10461068fb2b60045c9bb331 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..d87c0640dbdd24b13a96dcc552bd7f654bb3e07b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..dcdb3358ac6a5365df3a1d18246a18e026d646cf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..bd122976777d61d02abca17b8b235bff67b734dd Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b29bfeaf16bfdd6adf5787c009254487b930961f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_loss.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..987a590ec4d1fcecf2bb20672dbae3e1a65d1eee Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..41ae62cd446b9a7df6624388d9923a5c630e97db Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..175965d039521134f416b808cab09b702f044a9a Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..2ad4a3197a5810144e06b50ede4fada02452e55e 
Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/logging.jsonl b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b86e4926030669f98beccf00ce3037566107fdc5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/logging.jsonl @@ -0,0 +1,66 @@ +{"loss": 2.21191406, "grad_norm": 4.51287087, "learning_rate": 7.69e-06, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.069128, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -127.0, "logps/chosen": -482.0, "logits/rejected": -1.7265625, "logits/chosen": -1.6328125, "nll_loss": 0.4765625, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "10s", "remaining_time": "41m 49s"} +{"loss": 2.10546875, "grad_norm": 4.49094864, "learning_rate": 3.846e-05, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.114906, "rewards/chosen": 0.02502441, "rewards/rejected": 0.0, "rewards/accuracies": 0.25, "rewards/margins": 0.02502441, "logps/rejected": -208.0, "logps/chosen": -552.0, "logits/rejected": -1.3671875, "logits/chosen": -1.1953125, "nll_loss": 0.875, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "39s", "remaining_time": "31m 57s"} +{"loss": 1.89394531, "grad_norm": 3.95693906, "learning_rate": 7.692e-05, "memory(GiB)": 22.08, "train_speed(iter/s)": 0.113412, "rewards/chosen": 0.01013184, "rewards/rejected": 0.13964844, "rewards/accuracies": 0.40000001, "rewards/margins": -0.12988281, "logps/rejected": -476.0, "logps/chosen": -496.0, "logits/rejected": -1.78125, "logits/chosen": -1.75, "nll_loss": 
1.21875, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "1m 23s", "remaining_time": "33m 30s"} +{"loss": 1.66137695, "grad_norm": 3.62288213, "learning_rate": 9.998e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.108256, "rewards/chosen": 2.203125, "rewards/rejected": 0.36132812, "rewards/accuracies": 1.0, "rewards/margins": 1.8359375, "logps/rejected": -900.0, "logps/chosen": -426.0, "logits/rejected": -1.3515625, "logits/chosen": -1.515625, "nll_loss": 1.0, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "2m 14s", "remaining_time": "35m 2s"} +{"loss": 1.42441406, "grad_norm": 2.02799425, "learning_rate": 9.978e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.110602, "rewards/chosen": 5.0, "rewards/rejected": 4.96875, "rewards/accuracies": 0.60000002, "rewards/margins": 0.05615234, "logps/rejected": -262.0, "logps/chosen": -316.0, "logits/rejected": -1.65625, "logits/chosen": -1.8828125, "nll_loss": 0.49414062, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "2m 56s", "remaining_time": "33m 49s"} +{"eval_loss": 0.76074219, "eval_runtime": 3.7645, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 7.5625, "eval_rewards/rejected": 6.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 0.875, "eval_logps/rejected": -151.0, "eval_logps/chosen": -20.125, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.4921875, "eval_nll_loss": 0.87890625, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "3m 0s", "remaining_time": "34m 32s"} +{"loss": 0.88779297, "grad_norm": 1.84634085, "learning_rate": 9.937e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.109574, "rewards/chosen": 8.375, "rewards/rejected": 6.0625, "rewards/accuracies": 1.0, "rewards/margins": 2.328125, "logps/rejected": -312.0, "logps/chosen": -264.0, "logits/rejected": -1.390625, 
"logits/chosen": -1.5703125, "nll_loss": 0.42578125, "epoch": 0.5, "global_step/max_steps": "25/250", "percentage": "10.00%", "elapsed_time": "3m 43s", "remaining_time": "33m 33s"} +{"loss": 0.62050781, "grad_norm": 1.68053513, "learning_rate": 9.874e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.112928, "rewards/chosen": 8.5, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 3.546875, "logps/rejected": -540.0, "logps/chosen": -318.0, "logits/rejected": -1.859375, "logits/chosen": -1.9765625, "nll_loss": 0.6015625, "epoch": 0.6, "global_step/max_steps": "30/250", "percentage": "12.00%", "elapsed_time": "4m 21s", "remaining_time": "31m 55s"} +{"loss": 0.5630127, "grad_norm": 4.7447184, "learning_rate": 9.789e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.115939, "rewards/chosen": 10.4375, "rewards/rejected": 2.15625, "rewards/accuracies": 1.0, "rewards/margins": 8.25, "logps/rejected": -221.0, "logps/chosen": -450.0, "logits/rejected": -1.734375, "logits/chosen": -1.6875, "nll_loss": 0.72265625, "epoch": 0.7, "global_step/max_steps": "35/250", "percentage": "14.00%", "elapsed_time": "4m 57s", "remaining_time": "30m 27s"} +{"loss": 0.46617432, "grad_norm": 0.48349891, "learning_rate": 9.683e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.114308, "rewards/chosen": 9.625, "rewards/rejected": 1.8125, "rewards/accuracies": 1.0, "rewards/margins": 7.8125, "logps/rejected": -318.0, "logps/chosen": -174.0, "logits/rejected": -1.7265625, "logits/chosen": -1.9375, "nll_loss": 0.50390625, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "5m 45s", "remaining_time": "30m 14s"} +{"eval_loss": 0.46484375, "eval_runtime": 3.7281, "eval_samples_per_second": 1.073, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": 1.3984375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.21875, "eval_logps/rejected": -204.0, "eval_logps/chosen": -8.9375, "eval_logits/rejected": 
-1.6484375, "eval_logits/chosen": -1.765625, "eval_nll_loss": 0.38671875, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "5m 49s", "remaining_time": "30m 33s"} +{"loss": 0.45522461, "grad_norm": 0.48512233, "learning_rate": 9.557e-05, "memory(GiB)": 45.68, "train_speed(iter/s)": 0.112458, "rewards/chosen": 15.5625, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 12.4375, "logps/rejected": -406.0, "logps/chosen": -388.0, "logits/rejected": -1.8828125, "logits/chosen": -1.9375, "nll_loss": 0.46875, "epoch": 0.9, "global_step/max_steps": "45/250", "percentage": "18.00%", "elapsed_time": "6m 35s", "remaining_time": "30m 2s"} +{"loss": 0.43553467, "grad_norm": 0.73050677, "learning_rate": 9.411e-05, "memory(GiB)": 47.13, "train_speed(iter/s)": 0.113671, "rewards/chosen": 11.1875, "rewards/rejected": 4.0, "rewards/accuracies": 1.0, "rewards/margins": 7.15625, "logps/rejected": -924.0, "logps/chosen": -386.0, "logits/rejected": -1.46875, "logits/chosen": -1.7421875, "nll_loss": 0.38476562, "epoch": 1.0, "global_step/max_steps": "50/250", "percentage": "20.00%", "elapsed_time": "7m 15s", "remaining_time": "29m 1s"} +{"loss": 0.47902832, "grad_norm": 0.20273218, "learning_rate": 9.245e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113945, "rewards/chosen": 13.875, "rewards/rejected": 3.1875, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -584.0, "logps/chosen": -322.0, "logits/rejected": -1.828125, "logits/chosen": -1.8125, "nll_loss": 0.4921875, "epoch": 1.1, "global_step/max_steps": "55/250", "percentage": "22.00%", "elapsed_time": "7m 58s", "remaining_time": "28m 15s"} +{"loss": 0.43085327, "grad_norm": 0.17630689, "learning_rate": 9.061e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114879, "rewards/chosen": 12.9375, "rewards/rejected": 1.640625, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -180.0, "logits/rejected": 
-1.6796875, "logits/chosen": -1.9140625, "nll_loss": 0.32617188, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "8m 37s", "remaining_time": "27m 20s"} +{"eval_loss": 0.44628906, "eval_runtime": 3.7235, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.1875, "eval_logps/rejected": -223.0, "eval_logps/chosen": -8.3125, "eval_logits/rejected": -1.6171875, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.36132812, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "8m 41s", "remaining_time": "27m 31s"} +{"loss": 0.47626953, "grad_norm": 0.33555811, "learning_rate": 8.858e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113843, "rewards/chosen": 11.5625, "rewards/rejected": 1.375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -660.0, "logps/chosen": -258.0, "logits/rejected": -1.6015625, "logits/chosen": -1.734375, "nll_loss": 0.48046875, "epoch": 1.3, "global_step/max_steps": "65/250", "percentage": "26.00%", "elapsed_time": "9m 26s", "remaining_time": "26m 52s"} +{"loss": 0.42114258, "grad_norm": 0.30905785, "learning_rate": 8.639e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115066, "rewards/chosen": 9.375, "rewards/rejected": 1.078125, "rewards/accuracies": 1.0, "rewards/margins": 8.3125, "logps/rejected": -864.0, "logps/chosen": -53.0, "logits/rejected": -1.578125, "logits/chosen": -2.046875, "nll_loss": 0.53125, "epoch": 1.4, "global_step/max_steps": "70/250", "percentage": "28.00%", "elapsed_time": "10m 3s", "remaining_time": "25m 53s"} +{"loss": 0.4194458, "grad_norm": 0.26797923, "learning_rate": 8.404e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115391, "rewards/chosen": 13.9375, "rewards/rejected": 1.6328125, "rewards/accuracies": 1.0, "rewards/margins": 12.3125, "logps/rejected": -356.0, "logps/chosen": 
-374.0, "logits/rejected": -1.6875, "logits/chosen": -1.875, "nll_loss": 0.38085938, "epoch": 1.5, "global_step/max_steps": "75/250", "percentage": "30.00%", "elapsed_time": "10m 45s", "remaining_time": "25m 6s"} +{"loss": 0.45340576, "grad_norm": 0.17842024, "learning_rate": 8.154e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.116557, "rewards/chosen": 16.0, "rewards/rejected": 3.421875, "rewards/accuracies": 1.0, "rewards/margins": 12.625, "logps/rejected": -109.0, "logps/chosen": -454.0, "logits/rejected": -1.90625, "logits/chosen": -1.6640625, "nll_loss": 0.54296875, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "11m 21s", "remaining_time": "24m 9s"} +{"eval_loss": 0.43115234, "eval_runtime": 3.6954, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -1.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.875, "eval_logps/rejected": -228.0, "eval_logps/chosen": -7.125, "eval_logits/rejected": -1.578125, "eval_logits/chosen": -1.8125, "eval_nll_loss": 0.31054688, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "11m 25s", "remaining_time": "24m 17s"} +{"loss": 0.56628418, "grad_norm": 0.37601944, "learning_rate": 7.89e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115467, "rewards/chosen": 12.3125, "rewards/rejected": 1.5234375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -620.0, "logps/chosen": -237.0, "logits/rejected": -1.671875, "logits/chosen": -1.7578125, "nll_loss": 0.40625, "epoch": 1.7, "global_step/max_steps": "85/250", "percentage": "34.00%", "elapsed_time": "12m 11s", "remaining_time": "23m 40s"} +{"loss": 0.42229004, "grad_norm": 0.31117947, "learning_rate": 7.614e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114775, "rewards/chosen": 13.125, "rewards/rejected": 0.96875, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -426.0, 
"logps/chosen": -240.0, "logits/rejected": -1.8046875, "logits/chosen": -1.75, "nll_loss": 0.4453125, "epoch": 1.8, "global_step/max_steps": "90/250", "percentage": "36.00%", "elapsed_time": "12m 59s", "remaining_time": "23m 6s"} +{"loss": 0.46682129, "grad_norm": 0.33464928, "learning_rate": 7.326e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114788, "rewards/chosen": 15.0, "rewards/rejected": 2.375, "rewards/accuracies": 1.0, "rewards/margins": 12.625, "logps/rejected": -446.0, "logps/chosen": -286.0, "logits/rejected": -1.4921875, "logits/chosen": -1.796875, "nll_loss": 0.38671875, "epoch": 1.9, "global_step/max_steps": "95/250", "percentage": "38.00%", "elapsed_time": "13m 43s", "remaining_time": "22m 23s"} +{"loss": 0.41916199, "grad_norm": 0.26956367, "learning_rate": 7.028e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114564, "rewards/chosen": 12.0, "rewards/rejected": 1.15625, "rewards/accuracies": 1.0, "rewards/margins": 10.8125, "logps/rejected": -288.0, "logps/chosen": -222.0, "logits/rejected": -1.609375, "logits/chosen": -1.7578125, "nll_loss": 0.36328125, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "14m 28s", "remaining_time": "21m 42s"} +{"eval_loss": 0.42919922, "eval_runtime": 3.7095, "eval_samples_per_second": 1.078, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.8125, "eval_rewards/rejected": -0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.3125, "eval_logps/rejected": -223.0, "eval_logps/chosen": -7.25, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8515625, "eval_nll_loss": 0.31445312, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "14m 32s", "remaining_time": "21m 48s"} +{"loss": 0.43474121, "grad_norm": 0.23160498, "learning_rate": 6.72e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113576, "rewards/chosen": 15.0625, "rewards/rejected": 3.765625, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, 
"logps/rejected": -540.0, "logps/chosen": -366.0, "logits/rejected": -1.796875, "logits/chosen": -1.8515625, "nll_loss": 0.55078125, "epoch": 2.1, "global_step/max_steps": "105/250", "percentage": "42.00%", "elapsed_time": "15m 20s", "remaining_time": "21m 10s"} +{"loss": 0.42264404, "grad_norm": 0.32495459, "learning_rate": 6.406e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113607, "rewards/chosen": 13.625, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -824.0, "logps/chosen": -332.0, "logits/rejected": -1.453125, "logits/chosen": -1.5390625, "nll_loss": 0.37890625, "epoch": 2.2, "global_step/max_steps": "110/250", "percentage": "44.00%", "elapsed_time": "16m 3s", "remaining_time": "20m 26s"} +{"loss": 0.44799805, "grad_norm": 0.16989864, "learning_rate": 6.085e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113793, "rewards/chosen": 16.0, "rewards/rejected": 2.96875, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -576.0, "logps/chosen": -414.0, "logits/rejected": -1.6328125, "logits/chosen": -1.7734375, "nll_loss": 0.40234375, "epoch": 2.3, "global_step/max_steps": "115/250", "percentage": "46.00%", "elapsed_time": "16m 46s", "remaining_time": "19m 41s"} +{"loss": 0.36958008, "grad_norm": 0.28580004, "learning_rate": 5.759e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114561, "rewards/chosen": 13.125, "rewards/rejected": 1.53125, "rewards/accuracies": 1.0, "rewards/margins": 11.625, "logps/rejected": -286.0, "logps/chosen": -312.0, "logits/rejected": -1.765625, "logits/chosen": -1.828125, "nll_loss": 0.31640625, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "17m 23s", "remaining_time": "18m 50s"} +{"eval_loss": 0.42285156, "eval_runtime": 3.7647, "eval_samples_per_second": 1.062, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, 
"eval_rewards/margins": 9.5, "eval_logps/rejected": -224.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": -1.6171875, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.296875, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "17m 26s", "remaining_time": "18m 54s"} +{"loss": 0.41490173, "grad_norm": 0.30453528, "learning_rate": 5.43e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114475, "rewards/chosen": 10.375, "rewards/rejected": -0.59765625, "rewards/accuracies": 1.0, "rewards/margins": 10.9375, "logps/rejected": -920.0, "logps/chosen": -43.25, "logits/rejected": -1.703125, "logits/chosen": -2.140625, "nll_loss": 0.52734375, "epoch": 2.5, "global_step/max_steps": "125/250", "percentage": "50.00%", "elapsed_time": "18m 7s", "remaining_time": "18m 7s"} +{"loss": 0.40163574, "grad_norm": 0.42496086, "learning_rate": 5.099e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115111, "rewards/chosen": 16.625, "rewards/rejected": 1.8984375, "rewards/accuracies": 1.0, "rewards/margins": 14.75, "logps/rejected": -238.0, "logps/chosen": -330.0, "logits/rejected": -1.8671875, "logits/chosen": -1.8203125, "nll_loss": 0.4375, "epoch": 2.6, "global_step/max_steps": "130/250", "percentage": "52.00%", "elapsed_time": "18m 44s", "remaining_time": "17m 18s"} +{"loss": 0.3717247, "grad_norm": 0.40794794, "learning_rate": 4.768e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115369, "rewards/chosen": 11.0, "rewards/rejected": 0.33984375, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -516.0, "logps/chosen": -127.5, "logits/rejected": -1.7734375, "logits/chosen": -1.875, "nll_loss": 0.21679688, "epoch": 2.7, "global_step/max_steps": "135/250", "percentage": "54.00%", "elapsed_time": "19m 25s", "remaining_time": "16m 33s"} +{"loss": 0.39625893, "grad_norm": 0.20371861, "learning_rate": 4.438e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115894, "rewards/chosen": 14.1875, "rewards/rejected": 
0.91796875, "rewards/accuracies": 1.0, "rewards/margins": 13.25, "logps/rejected": -580.0, "logps/chosen": -290.0, "logits/rejected": -1.8359375, "logits/chosen": -1.96875, "nll_loss": 0.55078125, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "20m 3s", "remaining_time": "15m 45s"} +{"eval_loss": 0.41845703, "eval_runtime": 3.6938, "eval_samples_per_second": 1.083, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.5, "eval_logps/rejected": -224.0, "eval_logps/chosen": -6.5625, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.8984375, "eval_nll_loss": 0.28515625, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "20m 7s", "remaining_time": "15m 48s"} +{"loss": 0.43898315, "grad_norm": 0.66314645, "learning_rate": 4.11e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114627, "rewards/chosen": 14.8125, "rewards/rejected": 3.015625, "rewards/accuracies": 1.0, "rewards/margins": 11.8125, "logps/rejected": -532.0, "logps/chosen": -482.0, "logits/rejected": -1.84375, "logits/chosen": -1.8125, "nll_loss": 0.55859375, "epoch": 2.9, "global_step/max_steps": "145/250", "percentage": "58.00%", "elapsed_time": "21m 0s", "remaining_time": "15m 12s"} +{"loss": 0.33739624, "grad_norm": 0.35248744, "learning_rate": 3.786e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114227, "rewards/chosen": 19.25, "rewards/rejected": 3.125, "rewards/accuracies": 1.0, "rewards/margins": 16.0, "logps/rejected": -235.0, "logps/chosen": -348.0, "logits/rejected": -1.8671875, "logits/chosen": -1.5546875, "nll_loss": 0.328125, "epoch": 3.0, "global_step/max_steps": "150/250", "percentage": "60.00%", "elapsed_time": "21m 48s", "remaining_time": "14m 32s"} +{"loss": 0.41593018, "grad_norm": 0.22136085, "learning_rate": 3.468e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114508, "rewards/chosen": 
16.875, "rewards/rejected": 2.21875, "rewards/accuracies": 1.0, "rewards/margins": 14.625, "logps/rejected": -490.0, "logps/chosen": -444.0, "logits/rejected": -1.78125, "logits/chosen": -1.7265625, "nll_loss": 0.40625, "epoch": 3.1, "global_step/max_steps": "155/250", "percentage": "62.00%", "elapsed_time": "22m 29s", "remaining_time": "13m 46s"} +{"loss": 0.37825317, "grad_norm": 0.2740483, "learning_rate": 3.156e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114379, "rewards/chosen": 17.75, "rewards/rejected": 2.9375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -306.0, "logps/chosen": -380.0, "logits/rejected": -2.046875, "logits/chosen": -1.859375, "nll_loss": 0.36523438, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "23m 14s", "remaining_time": "13m 4s"} +{"eval_loss": 0.421875, "eval_runtime": 3.7366, "eval_samples_per_second": 1.07, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.0625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -6.9375, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.30078125, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "23m 18s", "remaining_time": "13m 6s"} +{"loss": 0.35755234, "grad_norm": 1.0146972, "learning_rate": 2.852e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114154, "rewards/chosen": 12.3125, "rewards/rejected": -1.1640625, "rewards/accuracies": 1.0, "rewards/margins": 13.5, "logps/rejected": -412.0, "logps/chosen": -133.0, "logits/rejected": -1.6953125, "logits/chosen": -1.7421875, "nll_loss": 0.15039062, "epoch": 3.3, "global_step/max_steps": "165/250", "percentage": "66.00%", "elapsed_time": "24m 1s", "remaining_time": "12m 22s"} +{"loss": 0.35981827, "grad_norm": 0.3508282, "learning_rate": 2.558e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 
0.113875, "rewards/chosen": 22.0, "rewards/rejected": 3.0625, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -141.0, "logps/chosen": -498.0, "logits/rejected": -1.8671875, "logits/chosen": -1.53125, "nll_loss": 0.34375, "epoch": 3.4, "global_step/max_steps": "170/250", "percentage": "68.00%", "elapsed_time": "24m 48s", "remaining_time": "11m 40s"} +{"loss": 0.32151184, "grad_norm": 0.45650429, "learning_rate": 2.274e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114047, "rewards/chosen": 15.625, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 13.625, "logps/rejected": -430.0, "logps/chosen": -306.0, "logits/rejected": -1.546875, "logits/chosen": -1.6015625, "nll_loss": 0.43945312, "epoch": 3.5, "global_step/max_steps": "175/250", "percentage": "70.00%", "elapsed_time": "25m 30s", "remaining_time": "10m 55s"} +{"loss": 0.40310822, "grad_norm": 0.28047321, "learning_rate": 2.002e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114563, "rewards/chosen": 17.75, "rewards/rejected": 1.734375, "rewards/accuracies": 1.0, "rewards/margins": 15.9375, "logps/rejected": -232.0, "logps/chosen": -426.0, "logits/rejected": -1.9765625, "logits/chosen": -1.7890625, "nll_loss": 0.50390625, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "26m 6s", "remaining_time": "10m 9s"} +{"eval_loss": 0.43261719, "eval_runtime": 3.7191, "eval_samples_per_second": 1.076, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.75, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.9375, "eval_logps/rejected": -230.0, "eval_logps/chosen": -8.0, "eval_logits/rejected": -1.6484375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.34765625, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "26m 10s", "remaining_time": "10m 10s"} +{"loss": 0.35445404, "grad_norm": 0.29353907, "learning_rate": 1.744e-05, "memory(GiB)": 
57.7, "train_speed(iter/s)": 0.114484, "rewards/chosen": 17.5, "rewards/rejected": 2.296875, "rewards/accuracies": 1.0, "rewards/margins": 15.1875, "logps/rejected": -183.0, "logps/chosen": -412.0, "logits/rejected": -1.8984375, "logits/chosen": -1.671875, "nll_loss": 0.46679688, "epoch": 3.7, "global_step/max_steps": "185/250", "percentage": "74.00%", "elapsed_time": "26m 51s", "remaining_time": "9m 26s"} +{"loss": 0.35137253, "grad_norm": 0.34561753, "learning_rate": 1.5e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.11489, "rewards/chosen": 15.875, "rewards/rejected": 0.67578125, "rewards/accuracies": 1.0, "rewards/margins": 15.1875, "logps/rejected": -552.0, "logps/chosen": -246.0, "logits/rejected": -1.828125, "logits/chosen": -2.0625, "nll_loss": 0.53515625, "epoch": 3.8, "global_step/max_steps": "190/250", "percentage": "76.00%", "elapsed_time": "27m 29s", "remaining_time": "8m 40s"} +{"loss": 0.29118738, "grad_norm": 0.35528088, "learning_rate": 1.271e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114611, "rewards/chosen": 18.25, "rewards/rejected": 1.609375, "rewards/accuracies": 1.0, "rewards/margins": 16.625, "logps/rejected": -496.0, "logps/chosen": -348.0, "logits/rejected": -1.625, "logits/chosen": -1.5859375, "nll_loss": 0.33984375, "epoch": 3.9, "global_step/max_steps": "195/250", "percentage": "78.00%", "elapsed_time": "28m 17s", "remaining_time": "7m 58s"} +{"loss": 0.33329735, "grad_norm": 0.59754957, "learning_rate": 1.059e-05, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114365, "rewards/chosen": 16.125, "rewards/rejected": 1.859375, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -508.0, "logps/chosen": -270.0, "logits/rejected": -1.5078125, "logits/chosen": -1.59375, "nll_loss": 0.31054688, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "29m 4s", "remaining_time": "7m 16s"} +{"eval_loss": 0.43798828, "eval_runtime": 3.6821, "eval_samples_per_second": 1.086, 
"eval_steps_per_second": 0.272, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -229.0, "eval_logps/chosen": -8.5, "eval_logits/rejected": -1.65625, "eval_logits/chosen": -1.9296875, "eval_nll_loss": 0.36914062, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "29m 8s", "remaining_time": "7m 17s"} +{"loss": 0.36646633, "grad_norm": 0.33317032, "learning_rate": 8.63e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113733, "rewards/chosen": 15.5625, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 13.5, "logps/rejected": -556.0, "logps/chosen": -324.0, "logits/rejected": -1.6875, "logits/chosen": -1.75, "nll_loss": 0.34375, "epoch": 4.1, "global_step/max_steps": "205/250", "percentage": "82.00%", "elapsed_time": "29m 58s", "remaining_time": "6m 34s"} +{"loss": 0.31939468, "grad_norm": 0.36653812, "learning_rate": 6.87e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113959, "rewards/chosen": 17.875, "rewards/rejected": 3.109375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -192.0, "logps/chosen": -255.0, "logits/rejected": -1.7734375, "logits/chosen": -1.7890625, "nll_loss": 0.31640625, "epoch": 4.2, "global_step/max_steps": "210/250", "percentage": "84.00%", "elapsed_time": "30m 38s", "remaining_time": "5m 50s"} +{"loss": 0.33078327, "grad_norm": 0.42430161, "learning_rate": 5.29e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114016, "rewards/chosen": 13.0, "rewards/rejected": 0.00234985, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -892.0, "logps/chosen": -149.0, "logits/rejected": -1.5234375, "logits/chosen": -1.8984375, "nll_loss": 0.23828125, "epoch": 4.3, "global_step/max_steps": "215/250", "percentage": "86.00%", "elapsed_time": "31m 21s", "remaining_time": "5m 6s"} +{"loss": 0.30391464, "grad_norm": 0.22292041, 
"learning_rate": 3.9e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113876, "rewards/chosen": 11.8125, "rewards/rejected": 0.12011719, "rewards/accuracies": 1.0, "rewards/margins": 11.6875, "logps/rejected": -780.0, "logps/chosen": -81.5, "logits/rejected": -1.53125, "logits/chosen": -1.796875, "nll_loss": 0.1484375, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "32m 7s", "remaining_time": "4m 22s"} +{"eval_loss": 0.44433594, "eval_runtime": 3.7528, "eval_samples_per_second": 1.066, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.0625, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.39453125, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "32m 11s", "remaining_time": "4m 23s"} +{"loss": 0.34355936, "grad_norm": 0.30310486, "learning_rate": 2.72e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113594, "rewards/chosen": 23.875, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 20.75, "logps/rejected": -197.0, "logps/chosen": -524.0, "logits/rejected": -1.8828125, "logits/chosen": -1.75, "nll_loss": 0.55078125, "epoch": 4.5, "global_step/max_steps": "225/250", "percentage": "90.00%", "elapsed_time": "32m 56s", "remaining_time": "3m 39s"} +{"loss": 0.31268191, "grad_norm": 0.37419192, "learning_rate": 1.75e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113544, "rewards/chosen": 17.125, "rewards/rejected": 1.8046875, "rewards/accuracies": 1.0, "rewards/margins": 15.25, "logps/rejected": -676.0, "logps/chosen": -252.0, "logits/rejected": -1.703125, "logits/chosen": -1.7421875, "nll_loss": 0.35546875, "epoch": 4.6, "global_step/max_steps": "230/250", "percentage": "92.00%", "elapsed_time": "33m 41s", "remaining_time": "2m 55s"} +{"loss": 
0.28522224, "grad_norm": 0.47750186, "learning_rate": 9.9e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113578, "rewards/chosen": 13.3125, "rewards/rejected": -0.8515625, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -592.0, "logps/chosen": -179.0, "logits/rejected": -1.640625, "logits/chosen": -1.875, "nll_loss": 0.23925781, "epoch": 4.7, "global_step/max_steps": "235/250", "percentage": "94.00%", "elapsed_time": "34m 24s", "remaining_time": "2m 11s"} +{"loss": 0.32334194, "grad_norm": 0.28821359, "learning_rate": 4.4e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113901, "rewards/chosen": 19.0, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 16.25, "logps/rejected": -588.0, "logps/chosen": -380.0, "logits/rejected": -1.9453125, "logits/chosen": -2.0, "nll_loss": 0.578125, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "35m 2s", "remaining_time": "1m 27s"} +{"eval_loss": 0.44677734, "eval_runtime": 3.7327, "eval_samples_per_second": 1.072, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.75, "eval_logps/rejected": -229.0, "eval_logps/chosen": -9.3125, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.9140625, "eval_nll_loss": 0.40429688, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "35m 6s", "remaining_time": "1m 27s"} +{"loss": 0.3428854, "grad_norm": 0.3432327, "learning_rate": 1.1e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113824, "rewards/chosen": 17.5, "rewards/rejected": 1.921875, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -476.0, "logps/chosen": -456.0, "logits/rejected": -1.8984375, "logits/chosen": -1.6953125, "nll_loss": 0.47070312, "epoch": 4.9, "global_step/max_steps": "245/250", "percentage": "98.00%", "elapsed_time": "35m 48s", "remaining_time": 
"43s"} +{"loss": 0.37458334, "grad_norm": 1.67124536, "learning_rate": 0.0, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114113, "rewards/chosen": 16.125, "rewards/rejected": 1.59375, "rewards/accuracies": 1.0, "rewards/margins": 14.5625, "logps/rejected": -386.0, "logps/chosen": -332.0, "logits/rejected": -1.90625, "logits/chosen": -1.7890625, "nll_loss": 0.38085938, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 26s", "remaining_time": "0s"} +{"eval_loss": 0.44628906, "eval_runtime": 3.6978, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.25, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.9140625, "eval_nll_loss": 0.40234375, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 30s", "remaining_time": "0s"} +{"train_runtime": 2192.7386, "train_samples_per_second": 0.901, "train_steps_per_second": 0.114, "total_flos": 720119600250880.0, "train_loss": 0.51989253, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "36m 32s", "remaining_time": "0s"} +{"train_dataset": "1163.088608±494.952093, min=300.000000, max=4154.000000, size=395", "val_dataset": "1179.000000±512.550973, min=698.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 32830.9852M Params (67.1089M Trainable [0.2044%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-250", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/checkpoint-140", "best_metric": 0.41845703, "global_step": 250, "log_history": [{"loss": 
2.2119140625, "grad_norm": 4.512870866230471, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.069128, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -127.0, "logps/chosen": -482.0, "logits/rejected": -1.7265625, "logits/chosen": -1.6328125, "nll_loss": 0.4765625, "epoch": 0.02, "step": 1}, {"loss": 2.10546875, "grad_norm": 4.490948635598285, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 16.32, "train_speed(iter/s)": 0.114906, "rewards/chosen": 0.0250244140625, "rewards/rejected": 0.0, "rewards/accuracies": 0.25, "rewards/margins": 0.0250244140625, "logps/rejected": -208.0, "logps/chosen": -552.0, "logits/rejected": -1.3671875, "logits/chosen": -1.1953125, "nll_loss": 0.875, "epoch": 0.1, "step": 5}, {"loss": 1.8939453125, "grad_norm": 3.956939059703609, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 22.08, "train_speed(iter/s)": 0.113412, "rewards/chosen": 0.0101318359375, "rewards/rejected": 0.1396484375, "rewards/accuracies": 0.4000000059604645, "rewards/margins": -0.1298828125, "logps/rejected": -476.0, "logps/chosen": -496.0, "logits/rejected": -1.78125, "logits/chosen": -1.75, "nll_loss": 1.21875, "epoch": 0.2, "step": 10}, {"loss": 1.661376953125, "grad_norm": 3.6228821312017345, "learning_rate": 9.998242976313776e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.108256, "rewards/chosen": 2.203125, "rewards/rejected": 0.361328125, "rewards/accuracies": 1.0, "rewards/margins": 1.8359375, "logps/rejected": -900.0, "logps/chosen": -426.0, "logits/rejected": -1.3515625, "logits/chosen": -1.515625, "nll_loss": 1.0, "epoch": 0.3, "step": 15}, {"loss": 1.4244140625, "grad_norm": 2.027994249854522, "learning_rate": 9.97849063861667e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.110602, "rewards/chosen": 5.0, "rewards/rejected": 4.96875, "rewards/accuracies": 0.6000000238418579, "rewards/margins": 0.05615234375, "logps/rejected": -262.0, 
"logps/chosen": -316.0, "logits/rejected": -1.65625, "logits/chosen": -1.8828125, "nll_loss": 0.494140625, "epoch": 0.4, "step": 20}, {"eval_loss": 0.7607421875, "eval_runtime": 3.7645, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 7.5625, "eval_rewards/rejected": 6.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 0.875, "eval_logps/rejected": -151.0, "eval_logps/chosen": -20.125, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.4921875, "eval_nll_loss": 0.87890625, "epoch": 0.4, "step": 20}, {"loss": 0.88779296875, "grad_norm": 1.8463408544237272, "learning_rate": 9.936876709681668e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.109574, "rewards/chosen": 8.375, "rewards/rejected": 6.0625, "rewards/accuracies": 1.0, "rewards/margins": 2.328125, "logps/rejected": -312.0, "logps/chosen": -264.0, "logits/rejected": -1.390625, "logits/chosen": -1.5703125, "nll_loss": 0.42578125, "epoch": 0.5, "step": 25}, {"loss": 0.6205078125, "grad_norm": 1.680535134943694, "learning_rate": 9.873583924954152e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.112928, "rewards/chosen": 8.5, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 3.546875, "logps/rejected": -540.0, "logps/chosen": -318.0, "logits/rejected": -1.859375, "logits/chosen": -1.9765625, "nll_loss": 0.6015625, "epoch": 0.6, "step": 30}, {"loss": 0.5630126953125, "grad_norm": 4.744718397124442, "learning_rate": 9.788890216258939e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.115939, "rewards/chosen": 10.4375, "rewards/rejected": 2.15625, "rewards/accuracies": 1.0, "rewards/margins": 8.25, "logps/rejected": -221.0, "logps/chosen": -450.0, "logits/rejected": -1.734375, "logits/chosen": -1.6875, "nll_loss": 0.72265625, "epoch": 0.7, "step": 35}, {"loss": 0.46617431640625, "grad_norm": 0.48349890650801186, "learning_rate": 9.68316749134364e-05, "memory(GiB)": 40.0, "train_speed(iter/s)": 0.114308, "rewards/chosen": 9.625, 
"rewards/rejected": 1.8125, "rewards/accuracies": 1.0, "rewards/margins": 7.8125, "logps/rejected": -318.0, "logps/chosen": -174.0, "logits/rejected": -1.7265625, "logits/chosen": -1.9375, "nll_loss": 0.50390625, "epoch": 0.8, "step": 40}, {"eval_loss": 0.46484375, "eval_runtime": 3.7281, "eval_samples_per_second": 1.073, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": 1.3984375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.21875, "eval_logps/rejected": -204.0, "eval_logps/chosen": -8.9375, "eval_logits/rejected": -1.6484375, "eval_logits/chosen": -1.765625, "eval_nll_loss": 0.38671875, "epoch": 0.8, "step": 40}, {"loss": 0.455224609375, "grad_norm": 0.48512233095475726, "learning_rate": 9.55688000075414e-05, "memory(GiB)": 45.68, "train_speed(iter/s)": 0.112458, "rewards/chosen": 15.5625, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 12.4375, "logps/rejected": -406.0, "logps/chosen": -388.0, "logits/rejected": -1.8828125, "logits/chosen": -1.9375, "nll_loss": 0.46875, "epoch": 0.9, "step": 45}, {"loss": 0.43553466796875, "grad_norm": 0.7305067655375815, "learning_rate": 9.410582299213573e-05, "memory(GiB)": 47.13, "train_speed(iter/s)": 0.113671, "rewards/chosen": 11.1875, "rewards/rejected": 4.0, "rewards/accuracies": 1.0, "rewards/margins": 7.15625, "logps/rejected": -924.0, "logps/chosen": -386.0, "logits/rejected": -1.46875, "logits/chosen": -1.7421875, "nll_loss": 0.384765625, "epoch": 1.0, "step": 50}, {"loss": 0.4790283203125, "grad_norm": 0.2027321813402121, "learning_rate": 9.244916810456821e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113945, "rewards/chosen": 13.875, "rewards/rejected": 3.1875, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -584.0, "logps/chosen": -322.0, "logits/rejected": -1.828125, "logits/chosen": -1.8125, "nll_loss": 0.4921875, "epoch": 1.1, "step": 55}, {"loss": 0.430853271484375, "grad_norm": 0.17630688555010188, 
"learning_rate": 9.060611006213832e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114879, "rewards/chosen": 12.9375, "rewards/rejected": 1.640625, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -180.0, "logits/rejected": -1.6796875, "logits/chosen": -1.9140625, "nll_loss": 0.326171875, "epoch": 1.2, "step": 60}, {"eval_loss": 0.4462890625, "eval_runtime": 3.7235, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.1875, "eval_logps/rejected": -223.0, "eval_logps/chosen": -8.3125, "eval_logits/rejected": -1.6171875, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.361328125, "epoch": 1.2, "step": 60}, {"loss": 0.47626953125, "grad_norm": 0.3355581056539049, "learning_rate": 8.858474211729469e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113843, "rewards/chosen": 11.5625, "rewards/rejected": 1.375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -660.0, "logps/chosen": -258.0, "logits/rejected": -1.6015625, "logits/chosen": -1.734375, "nll_loss": 0.48046875, "epoch": 1.3, "step": 65}, {"loss": 0.421142578125, "grad_norm": 0.3090578542362743, "learning_rate": 8.639394051847472e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115066, "rewards/chosen": 9.375, "rewards/rejected": 1.078125, "rewards/accuracies": 1.0, "rewards/margins": 8.3125, "logps/rejected": -864.0, "logps/chosen": -53.0, "logits/rejected": -1.578125, "logits/chosen": -2.046875, "nll_loss": 0.53125, "epoch": 1.4, "step": 70}, {"loss": 0.41944580078125, "grad_norm": 0.2679792251697414, "learning_rate": 8.404332553264547e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115391, "rewards/chosen": 13.9375, "rewards/rejected": 1.6328125, "rewards/accuracies": 1.0, "rewards/margins": 12.3125, "logps/rejected": -356.0, "logps/chosen": -374.0, "logits/rejected": -1.6875, "logits/chosen": 
-1.875, "nll_loss": 0.380859375, "epoch": 1.5, "step": 75}, {"loss": 0.45340576171875, "grad_norm": 0.1784202423840373, "learning_rate": 8.154321920070414e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.116557, "rewards/chosen": 16.0, "rewards/rejected": 3.421875, "rewards/accuracies": 1.0, "rewards/margins": 12.625, "logps/rejected": -109.0, "logps/chosen": -454.0, "logits/rejected": -1.90625, "logits/chosen": -1.6640625, "nll_loss": 0.54296875, "epoch": 1.6, "step": 80}, {"eval_loss": 0.43115234375, "eval_runtime": 3.6954, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -1.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.875, "eval_logps/rejected": -228.0, "eval_logps/chosen": -7.125, "eval_logits/rejected": -1.578125, "eval_logits/chosen": -1.8125, "eval_nll_loss": 0.310546875, "epoch": 1.6, "step": 80}, {"loss": 0.5662841796875, "grad_norm": 0.3760194352348836, "learning_rate": 7.890460001124242e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115467, "rewards/chosen": 12.3125, "rewards/rejected": 1.5234375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -620.0, "logps/chosen": -237.0, "logits/rejected": -1.671875, "logits/chosen": -1.7578125, "nll_loss": 0.40625, "epoch": 1.7, "step": 85}, {"loss": 0.4222900390625, "grad_norm": 0.31117947132159346, "learning_rate": 7.613905469171246e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114775, "rewards/chosen": 13.125, "rewards/rejected": 0.96875, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -426.0, "logps/chosen": -240.0, "logits/rejected": -1.8046875, "logits/chosen": -1.75, "nll_loss": 0.4453125, "epoch": 1.8, "step": 90}, {"loss": 0.4668212890625, "grad_norm": 0.3346492838768647, "learning_rate": 7.325872732868869e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114788, "rewards/chosen": 15.0, "rewards/rejected": 2.375, "rewards/accuracies": 1.0, "rewards/margins": 
12.625, "logps/rejected": -446.0, "logps/chosen": -286.0, "logits/rejected": -1.4921875, "logits/chosen": -1.796875, "nll_loss": 0.38671875, "epoch": 1.9, "step": 95}, {"loss": 0.4191619873046875, "grad_norm": 0.2695636678577293, "learning_rate": 7.027626604064969e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114564, "rewards/chosen": 12.0, "rewards/rejected": 1.15625, "rewards/accuracies": 1.0, "rewards/margins": 10.8125, "logps/rejected": -288.0, "logps/chosen": -222.0, "logits/rejected": -1.609375, "logits/chosen": -1.7578125, "nll_loss": 0.36328125, "epoch": 2.0, "step": 100}, {"eval_loss": 0.42919921875, "eval_runtime": 3.7095, "eval_samples_per_second": 1.078, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.8125, "eval_rewards/rejected": -0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.3125, "eval_logps/rejected": -223.0, "eval_logps/chosen": -7.25, "eval_logits/rejected": -1.5859375, "eval_logits/chosen": -1.8515625, "eval_nll_loss": 0.314453125, "epoch": 2.0, "step": 100}, {"loss": 0.4347412109375, "grad_norm": 0.23160497773181857, "learning_rate": 6.720476743745072e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113576, "rewards/chosen": 15.0625, "rewards/rejected": 3.765625, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -540.0, "logps/chosen": -366.0, "logits/rejected": -1.796875, "logits/chosen": -1.8515625, "nll_loss": 0.55078125, "epoch": 2.1, "step": 105}, {"loss": 0.42264404296875, "grad_norm": 0.32495458997078863, "learning_rate": 6.405771911037699e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.113607, "rewards/chosen": 13.625, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -824.0, "logps/chosen": -332.0, "logits/rejected": -1.453125, "logits/chosen": -1.5390625, "nll_loss": 0.37890625, "epoch": 2.2, "step": 110}, {"loss": 0.447998046875, "grad_norm": 0.1698986411617881, "learning_rate": 6.08489404053159e-05, "memory(GiB)": 57.7, 
"train_speed(iter/s)": 0.113793, "rewards/chosen": 16.0, "rewards/rejected": 2.96875, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -576.0, "logps/chosen": -414.0, "logits/rejected": -1.6328125, "logits/chosen": -1.7734375, "nll_loss": 0.40234375, "epoch": 2.3, "step": 115}, {"loss": 0.369580078125, "grad_norm": 0.2858000402116111, "learning_rate": 5.7592521739125726e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114561, "rewards/chosen": 13.125, "rewards/rejected": 1.53125, "rewards/accuracies": 1.0, "rewards/margins": 11.625, "logps/rejected": -286.0, "logps/chosen": -312.0, "logits/rejected": -1.765625, "logits/chosen": -1.828125, "nll_loss": 0.31640625, "epoch": 2.4, "step": 120}, {"eval_loss": 0.4228515625, "eval_runtime": 3.7647, "eval_samples_per_second": 1.062, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.5, "eval_logps/rejected": -224.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": -1.6171875, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.296875, "epoch": 2.4, "step": 120}, {"loss": 0.4149017333984375, "grad_norm": 0.30453527889308935, "learning_rate": 5.430276272567485e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114475, "rewards/chosen": 10.375, "rewards/rejected": -0.59765625, "rewards/accuracies": 1.0, "rewards/margins": 10.9375, "logps/rejected": -920.0, "logps/chosen": -43.25, "logits/rejected": -1.703125, "logits/chosen": -2.140625, "nll_loss": 0.52734375, "epoch": 2.5, "step": 125}, {"loss": 0.4016357421875, "grad_norm": 0.4249608562032297, "learning_rate": 5.0994109383253506e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115111, "rewards/chosen": 16.625, "rewards/rejected": 1.8984375, "rewards/accuracies": 1.0, "rewards/margins": 14.75, "logps/rejected": -238.0, "logps/chosen": -330.0, "logits/rejected": -1.8671875, "logits/chosen": -1.8203125, "nll_loss": 0.4375, "epoch": 2.6, 
"step": 130}, {"loss": 0.37172470092773435, "grad_norm": 0.4079479376826424, "learning_rate": 4.768109069909307e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115369, "rewards/chosen": 11.0, "rewards/rejected": 0.33984375, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -516.0, "logps/chosen": -127.5, "logits/rejected": -1.7734375, "logits/chosen": -1.875, "nll_loss": 0.216796875, "epoch": 2.7, "step": 135}, {"loss": 0.39625892639160154, "grad_norm": 0.2037186119283533, "learning_rate": 4.4378254829551396e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.115894, "rewards/chosen": 14.1875, "rewards/rejected": 0.91796875, "rewards/accuracies": 1.0, "rewards/margins": 13.25, "logps/rejected": -580.0, "logps/chosen": -290.0, "logits/rejected": -1.8359375, "logits/chosen": -1.96875, "nll_loss": 0.55078125, "epoch": 2.8, "step": 140}, {"eval_loss": 0.41845703125, "eval_runtime": 3.6938, "eval_samples_per_second": 1.083, "eval_steps_per_second": 0.271, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -0.6015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.5, "eval_logps/rejected": -224.0, "eval_logps/chosen": -6.5625, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.8984375, "eval_nll_loss": 0.28515625, "epoch": 2.8, "step": 140}, {"loss": 0.438983154296875, "grad_norm": 0.663146454101917, "learning_rate": 4.11001052161225e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114627, "rewards/chosen": 14.8125, "rewards/rejected": 3.015625, "rewards/accuracies": 1.0, "rewards/margins": 11.8125, "logps/rejected": -532.0, "logps/chosen": -482.0, "logits/rejected": -1.84375, "logits/chosen": -1.8125, "nll_loss": 0.55859375, "epoch": 2.9, "step": 145}, {"loss": 0.337396240234375, "grad_norm": 0.35248744382044206, "learning_rate": 3.786103689779861e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114227, "rewards/chosen": 19.25, "rewards/rejected": 3.125, "rewards/accuracies": 1.0, "rewards/margins": 16.0, 
"logps/rejected": -235.0, "logps/chosen": -348.0, "logits/rejected": -1.8671875, "logits/chosen": -1.5546875, "nll_loss": 0.328125, "epoch": 3.0, "step": 150}, {"loss": 0.41593017578125, "grad_norm": 0.22136084702555914, "learning_rate": 3.467527329945026e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114508, "rewards/chosen": 16.875, "rewards/rejected": 2.21875, "rewards/accuracies": 1.0, "rewards/margins": 14.625, "logps/rejected": -490.0, "logps/chosen": -444.0, "logits/rejected": -1.78125, "logits/chosen": -1.7265625, "nll_loss": 0.40625, "epoch": 3.1, "step": 155}, {"loss": 0.378253173828125, "grad_norm": 0.27404829822114873, "learning_rate": 3.1556803773799614e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114379, "rewards/chosen": 17.75, "rewards/rejected": 2.9375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -306.0, "logps/chosen": -380.0, "logits/rejected": -2.046875, "logits/chosen": -1.859375, "nll_loss": 0.365234375, "epoch": 3.2, "step": 160}, {"eval_loss": 0.421875, "eval_runtime": 3.7366, "eval_samples_per_second": 1.07, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.875, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.0625, "eval_logps/rejected": -230.0, "eval_logps/chosen": -6.9375, "eval_logits/rejected": -1.6328125, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.30078125, "epoch": 3.2, "step": 160}, {"loss": 0.35755233764648436, "grad_norm": 1.014697201946726, "learning_rate": 2.8519322171253602e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114154, "rewards/chosen": 12.3125, "rewards/rejected": -1.1640625, "rewards/accuracies": 1.0, "rewards/margins": 13.5, "logps/rejected": -412.0, "logps/chosen": -133.0, "logits/rejected": -1.6953125, "logits/chosen": -1.7421875, "nll_loss": 0.150390625, "epoch": 3.3, "step": 165}, {"loss": 0.3598182678222656, "grad_norm": 0.35082819529806103, "learning_rate": 2.5576166707349385e-05, "memory(GiB)": 57.7, 
"train_speed(iter/s)": 0.113875, "rewards/chosen": 22.0, "rewards/rejected": 3.0625, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -141.0, "logps/chosen": -498.0, "logits/rejected": -1.8671875, "logits/chosen": -1.53125, "nll_loss": 0.34375, "epoch": 3.4, "step": 170}, {"loss": 0.3215118408203125, "grad_norm": 0.4565042851234479, "learning_rate": 2.2740261391866637e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114047, "rewards/chosen": 15.625, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 13.625, "logps/rejected": -430.0, "logps/chosen": -306.0, "logits/rejected": -1.546875, "logits/chosen": -1.6015625, "nll_loss": 0.439453125, "epoch": 3.5, "step": 175}, {"loss": 0.40310821533203123, "grad_norm": 0.2804732100108321, "learning_rate": 2.002405927680374e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114563, "rewards/chosen": 17.75, "rewards/rejected": 1.734375, "rewards/accuracies": 1.0, "rewards/margins": 15.9375, "logps/rejected": -232.0, "logps/chosen": -426.0, "logits/rejected": -1.9765625, "logits/chosen": -1.7890625, "nll_loss": 0.50390625, "epoch": 3.6, "step": 180}, {"eval_loss": 0.4326171875, "eval_runtime": 3.7191, "eval_samples_per_second": 1.076, "eval_steps_per_second": 0.269, "eval_rewards/chosen": 8.75, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.9375, "eval_logps/rejected": -230.0, "eval_logps/chosen": -8.0, "eval_logits/rejected": -1.6484375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.34765625, "epoch": 3.6, "step": 180}, {"loss": 0.35445404052734375, "grad_norm": 0.2935390673249648, "learning_rate": 1.743948777242814e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114484, "rewards/chosen": 17.5, "rewards/rejected": 2.296875, "rewards/accuracies": 1.0, "rewards/margins": 15.1875, "logps/rejected": -183.0, "logps/chosen": -412.0, "logits/rejected": -1.8984375, "logits/chosen": -1.671875, "nll_loss": 0.466796875, "epoch": 3.7, 
"step": 185}, {"loss": 0.35137252807617186, "grad_norm": 0.3456175313601738, "learning_rate": 1.4997896271528739e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.11489, "rewards/chosen": 15.875, "rewards/rejected": 0.67578125, "rewards/accuracies": 1.0, "rewards/margins": 15.1875, "logps/rejected": -552.0, "logps/chosen": -246.0, "logits/rejected": -1.828125, "logits/chosen": -2.0625, "nll_loss": 0.53515625, "epoch": 3.8, "step": 190}, {"loss": 0.29118738174438474, "grad_norm": 0.35528088328631074, "learning_rate": 1.2710006311864104e-05, "memory(GiB)": 57.7, "train_speed(iter/s)": 0.114611, "rewards/chosen": 18.25, "rewards/rejected": 1.609375, "rewards/accuracies": 1.0, "rewards/margins": 16.625, "logps/rejected": -496.0, "logps/chosen": -348.0, "logits/rejected": -1.625, "logits/chosen": -1.5859375, "nll_loss": 0.33984375, "epoch": 3.9, "step": 195}, {"loss": 0.3332973480224609, "grad_norm": 0.5975495665409197, "learning_rate": 1.0585864495652897e-05, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114365, "rewards/chosen": 16.125, "rewards/rejected": 1.859375, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -508.0, "logps/chosen": -270.0, "logits/rejected": -1.5078125, "logits/chosen": -1.59375, "nll_loss": 0.310546875, "epoch": 4.0, "step": 200}, {"eval_loss": 0.43798828125, "eval_runtime": 3.6821, "eval_samples_per_second": 1.086, "eval_steps_per_second": 0.272, "eval_rewards/chosen": 8.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -229.0, "eval_logps/chosen": -8.5, "eval_logits/rejected": -1.65625, "eval_logits/chosen": -1.9296875, "eval_nll_loss": 0.369140625, "epoch": 4.0, "step": 200}, {"loss": 0.3664663314819336, "grad_norm": 0.3331703215396951, "learning_rate": 8.634798372847148e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113733, "rewards/chosen": 15.5625, "rewards/rejected": 2.046875, "rewards/accuracies": 1.0, "rewards/margins": 13.5, 
"logps/rejected": -556.0, "logps/chosen": -324.0, "logits/rejected": -1.6875, "logits/chosen": -1.75, "nll_loss": 0.34375, "epoch": 4.1, "step": 205}, {"loss": 0.3193946838378906, "grad_norm": 0.3665381220661868, "learning_rate": 6.865375481914016e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113959, "rewards/chosen": 17.875, "rewards/rejected": 3.109375, "rewards/accuracies": 1.0, "rewards/margins": 14.8125, "logps/rejected": -192.0, "logps/chosen": -255.0, "logits/rejected": -1.7734375, "logits/chosen": -1.7890625, "nll_loss": 0.31640625, "epoch": 4.2, "step": 210}, {"loss": 0.3307832717895508, "grad_norm": 0.424301607874042, "learning_rate": 5.285365727986707e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114016, "rewards/chosen": 13.0, "rewards/rejected": 0.002349853515625, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -892.0, "logps/chosen": -149.0, "logits/rejected": -1.5234375, "logits/chosen": -1.8984375, "nll_loss": 0.23828125, "epoch": 4.3, "step": 215}, {"loss": 0.30391464233398435, "grad_norm": 0.22292040670451319, "learning_rate": 3.901707263589671e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113876, "rewards/chosen": 11.8125, "rewards/rejected": 0.1201171875, "rewards/accuracies": 1.0, "rewards/margins": 11.6875, "logps/rejected": -780.0, "logps/chosen": -81.5, "logits/rejected": -1.53125, "logits/chosen": -1.796875, "nll_loss": 0.1484375, "epoch": 4.4, "step": 220}, {"eval_loss": 0.4443359375, "eval_runtime": 3.7528, "eval_samples_per_second": 1.066, "eval_steps_per_second": 0.266, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.0625, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.39453125, "epoch": 4.4, "step": 220}, {"loss": 0.34355936050415037, "grad_norm": 0.30310486108061985, "learning_rate": 2.7204760217631074e-06, 
"memory(GiB)": 69.46, "train_speed(iter/s)": 0.113594, "rewards/chosen": 23.875, "rewards/rejected": 3.171875, "rewards/accuracies": 1.0, "rewards/margins": 20.75, "logps/rejected": -197.0, "logps/chosen": -524.0, "logits/rejected": -1.8828125, "logits/chosen": -1.75, "nll_loss": 0.55078125, "epoch": 4.5, "step": 225}, {"loss": 0.3126819133758545, "grad_norm": 0.3741919235199971, "learning_rate": 1.7468590353731495e-06, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113544, "rewards/chosen": 17.125, "rewards/rejected": 1.8046875, "rewards/accuracies": 1.0, "rewards/margins": 15.25, "logps/rejected": -676.0, "logps/chosen": -252.0, "logits/rejected": -1.703125, "logits/chosen": -1.7421875, "nll_loss": 0.35546875, "epoch": 4.6, "step": 230}, {"loss": 0.2852222442626953, "grad_norm": 0.47750186161501185, "learning_rate": 9.851316597681958e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113578, "rewards/chosen": 13.3125, "rewards/rejected": -0.8515625, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -592.0, "logps/chosen": -179.0, "logits/rejected": -1.640625, "logits/chosen": -1.875, "nll_loss": 0.2392578125, "epoch": 4.7, "step": 235}, {"loss": 0.32334194183349607, "grad_norm": 0.28821358712762607, "learning_rate": 4.386387988014273e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113901, "rewards/chosen": 19.0, "rewards/rejected": 2.828125, "rewards/accuracies": 1.0, "rewards/margins": 16.25, "logps/rejected": -588.0, "logps/chosen": -380.0, "logits/rejected": -1.9453125, "logits/chosen": -2.0, "nll_loss": 0.578125, "epoch": 4.8, "step": 240}, {"eval_loss": 0.44677734375, "eval_runtime": 3.7327, "eval_samples_per_second": 1.072, "eval_steps_per_second": 0.268, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.75, "eval_logps/rejected": -229.0, "eval_logps/chosen": -9.3125, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.9140625, "eval_nll_loss": 
0.404296875, "epoch": 4.8, "step": 240}, {"loss": 0.3428853988647461, "grad_norm": 0.3432327045772877, "learning_rate": 1.0978021666005478e-07, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.113824, "rewards/chosen": 17.5, "rewards/rejected": 1.921875, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -476.0, "logps/chosen": -456.0, "logits/rejected": -1.8984375, "logits/chosen": -1.6953125, "nll_loss": 0.470703125, "epoch": 4.9, "step": 245}, {"loss": 0.3745833396911621, "grad_norm": 1.6712453590352487, "learning_rate": 0.0, "memory(GiB)": 69.46, "train_speed(iter/s)": 0.114113, "rewards/chosen": 16.125, "rewards/rejected": 1.59375, "rewards/accuracies": 1.0, "rewards/margins": 14.5625, "logps/rejected": -386.0, "logps/chosen": -332.0, "logits/rejected": -1.90625, "logits/chosen": -1.7890625, "nll_loss": 0.380859375, "epoch": 5.0, "step": 250}, {"eval_loss": 0.4462890625, "eval_runtime": 3.6978, "eval_samples_per_second": 1.082, "eval_steps_per_second": 0.27, "eval_rewards/chosen": 8.625, "eval_rewards/rejected": -1.203125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -230.0, "eval_logps/chosen": -9.25, "eval_logits/rejected": -1.6640625, "eval_logits/chosen": -1.9140625, "eval_nll_loss": 0.40234375, "epoch": 5.0, "step": 250}, {"train_runtime": 2192.7386, "train_samples_per_second": 0.901, "train_steps_per_second": 0.114, "total_flos": 720119600250880.0, "train_loss": 0.5198925256729126, "epoch": 5.0, "step": 250}], "memory": 69.46484375} diff --git a/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs/events.out.tfevents.1737734300.kml-task-547024-record-9965643-prod-worker-0.54493.0 b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs/events.out.tfevents.1737734300.kml-task-547024-record-9965643-prod-worker-0.54493.0 new file mode 100644 index 
0000000000000000000000000000000000000000..b5dde059d3ef8eb149e4287a3282d3101485c064 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-155645/runs/events.out.tfevents.1737734300.kml-task-547024-record-9965643-prod-worker-0.54493.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28ff2d32409c61783711e26e205ada5fd4bd7afa1144a7c67748a950ebc5696 +size 61664 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + 
"bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + 
"save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + 
"wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + 
"lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + 
"reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), 
ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, 
use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, 
label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, 
train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_config.json 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c7e0b59dfb22b41b41552798e08b13ad240194b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc91aca622d78fb2def853a41c19aefeee808e69a3275170006818fe433f8819 +size 207244392 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": 
false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', 
logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 
'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, 
ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..569a288f0d5520535cf1bb85edeb724c4bbb7444 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d9df69191379cd779c338daa7d917ff52f64bf6749d2b6fb3d640800e24a23 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b33aede02509addfcced8c7b89f8396117878df0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:22549d210ed712999cf4366bee9c703e2afc01992b177d9ad23bf6b638463192 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d836cfcf42cc61d8b08a2cd118e2f80da8670d0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4cd6c5bdc7315cee5f19d0df0e5efa8f38ac5af422fa84c0a6ae2c2efed338 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52ced3d47f2a7696ab033a57001e4c015352c292 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24a84837fb64bc2dd9c3c919b77da55eb9d6b35b0fc57d2aabd6395bcd6e5b6 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..568fc45bbea947c8812c8e25db23ec9855893328 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56bf885d3e9e0d5bf4b278b337f3d69d581c9170b9fb5158cd3676e8fb4bfb7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b085e6d720141b47d1acb1b42994eff726c1369 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6305531b0bd924dab6ebcb38239b4696de37fce336246e1d5ffe420575a299 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..090550626d1153701385076f07dfac244a46c345 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc6e308820591f18dbf387df54eed293c3381dad3cec9634bea1a8323bb2205 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65e7002ae67bf9745831829c935fc02809e79531 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4045bfbe0d3cfbf0c4fdc837da819ee07c047c0651fb3292147a8668c004e20f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f822501d5822397820601b52ff19c5d7164b4295 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abeee0f28b3a7ef55b5c677b5cd04c62d917bc78a6582decbe1aa447e160a2f5 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a46b5e6da55796b08c4691a6f95361cfa8079613 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0f6092ad4b400c3a6ac5abe25dd94720e4ec2ecf8310c5127abdcaae873da1 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1607942f64be77f4685e08b0060a97e01c5580d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ff5de98f157f7431052e5abe06c8e2ab4b42138fd91c88826cb56daa9169b4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c07dd0b3211ec242c9c70c75637615e4becb8d53 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104289d59ab4af2627c4cc06bc9aacbd3faf772d070c27fafe6161297f7e626c +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a0697106a13a064acf5fd016458c96aa42efb9e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82d35c567a72766be728a58b2b08eadace3eaf54b03f59b8ed041285b596005d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf6dde6f85851ffabd0dea5c365203d294aa920b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3d4908a56f06e3481f6b86b199a8834c11c93a761d0e8f5617b81fce72a990 +size 
1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8828f3921e32ee5e5f7dfe51ae199ba3d78d1d65 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6088d952c20a745cc58c2f0a27239d92f4bc9d15366ef61dd6f131c0e809669 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9c2b8558f08875f7c436ae4222aa0725aa770ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3723ddeb824d63b389a6f562ee4b19f093efad28ce878f8e143cd082f80f0c66 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e44eb3fa45a557460f92c098f2fd1eccee781e9 
--- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e2e2737d63e3807ccb5977a1e9e37b3ec45ffad7bcc727e9e137aebc01368 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..75cc99e568438a015ba88404a80a7d10b281196b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + 
"rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + 
"eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": 
-0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + "epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": 
-532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + "logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + 
}, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + "step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 548292278091776.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bbf1636a62e23354441efa414c9c7b523036618c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e251962e7653540f13ffbaa5b4161b7920d31bae36ca1ddc77ad33a11b648353 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be2d1b764b865604b7d75655f7431215d7470834 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49353a623d8029a64e20ce557b7e5aaa8395dbac36de29047a5a4281acf44c78 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c48175914fc9abe2c1d2fb0282bd306da60d086 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54090ad50d95e8f29cfd8e642a80e0414ed8b0e629f231715c1cc5a014b56f7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe8a506dcaac0dba8e0ffe1881eadc7475092b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8301a5012d8faf8e65d0566250ae8cb26e6933e87b524102b9cfc3e1236d6233 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8da2fd3e859064ce33431b9b1dd76ad8559e558d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eb6b991f22ea0024017008e493c7067f1de36a3a2b8b1473c4dfef58f5cc2bf +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7426e1ffb3d9495701be3a192c555a4de86bac04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944b90607a943033c526db8b31cd7da0d79997b0a50565dfb88bec09d200b5be +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d2581338038de0c4b8fd062a9423554432dd8d96 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0834b13e536c70e5d55bd208a6bfc9a1332a248708da0cde10a757b6141feae9 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e82d56572cf33dbe1aaebb5e2a04bae7bb3ac18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:947d64687fff15e127b9ca956d05c891f60328d7901bbffa3f8f3c7cd1e79dd3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0db73b2b821575b24b9bd782931671c6c2e4e498 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d53d9e5a7b0aeb580fc6354233f4dad680c3e1fe7b97684f14ce881a7ee0757f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8eec0fe9ce19cd23cc07f36a4ca8cbae86e73cf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3510aaacf5039e060f4468f2f2d158717c1589acfeef61a474c7d8c6c152bd +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99bd15063823ba5bd4accc2313a867c5150fd69b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74adc72ee1cc40690fd1672e4922db0f1c44c2779468bf6c4bd3b870c4f5878 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81fea5e70392376c852ab4576d92a179abdd59be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b43e495bc25ef506c084ed733c9d59b39109254779ee59d468379bbad4ff015 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b8de3d27d90a19dd09d9cb708ae955939ccc89d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ca483442df2ad58f2499ab667e4fe435e0169508fed23221e02618231adb85 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be4410f8f8784df78d5e008d82ee932e8f51fdaa --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d316ffc88b7c3087d488ec9b0b66f55d478459a7c8d9627ea7e1479e640e8ef +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9578fc039e2b728d747d02b0caaccbdf17fae347 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac79909caa2f128bf738bf5e201d13ad4a3e545f1fd4135394df9518dd28ea68 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3043c5723ed13952cea6aa12e3d29a862d235520 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493e4447686beccf07728b65cc6d1b13fc930094c8d6a60d70acf0411d904f35 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82d5f2c61fa6d024d1976c542aa87b4bee87690c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca8396e32d30ca7b538bf4b044dc5841b959e460a378d73d78f64ac1c7844b2 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..0e13e0563ec45a863d519305a1251d3e72b9e3e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/latest @@ -0,0 +1 @@ +global_step120 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056328cf4dbfbdfaf5b7ffa668b29852f77a3798 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b76da7ccfd8d1a286433da6127628e0c6a1565950b2dea51fe5864ad3e6545 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9eee166ef49a04e34c82ad164c679824bd93f281 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 2.4, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 656871851032576.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..434760c98de32861d98bfe32fee660c3c169116f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea107f4d7a523a6a048f474e31cc28d3ac6aad013800c1712c313dae115cf59b +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7aa3aeb45419a518625b3b053884863d50eb34c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c14594f105e75709669adc9f6150fd5afa1b1a763a699a0ee2a7bec969da354 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab9b38fe1520b5c4c45277133d0296117c54a813 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e61eee500f59be816cc03eb84ae2bdc15fe0c613efd6b6d378ff5d76c0ca86 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f2167aaae94c4fc6e3ae82a0af2146ab7a41b26 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:645b69724533736e2ead74f1f93503db4265fdb8496d9d4fef3a6a46ea3d6266 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b7258cb7a357f112fff6f96486edfce4e2c006f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc14b37432ff5d5f30abb2b494ee2da9b69c9ab77043e5d6345f0f0bcb2276dc +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df41f9e9d95e3bcef605c503fad6f630a4b4f36b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5910c297c2172842ad6a9bbec20eb9117f5a99d36deadfa59a03eb5687cc6c63 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..06340d95211ad68c267c64adc05c59f00609bee1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7600cfce506a1436c30888e03ea3c9119877a2c88e67f1bf6001b4565d53b8b2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29a1a8d76c6765d7d428c32333210e0e55cc2a08 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88910762494faa686e9f2133f93a7fcdc051615680237a65f18a59da8141c4f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b35b2248edc7b41f70421ed9d5e3f4dc61446c4a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0efd784bc5d55a6e773b62010445833156d187c5e2ef2b098becdde1fad160e5 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..556a2447dff299500829d6b07996405b5aa7ef7e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db97381f0908046f9bfbb7d15408d7d57f7f8ae3ff2acb642fc60c91c5acc4b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d63c33b6beebfc35288a6b3959d2dec81c03109d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e5f8f2504e31f12e5f1a8bd898d18835a04c1afde31d47dffd0064a376b086 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29165b01b8cf74a7b098ed230b9dc24091dc4c62 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ce0adda744ba5ce4586f31a72963d69143696af0be7ab81247c93ee06d71a5 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df18699badc8a051ebac62dacecaaabcab780ed0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ebc38a52c4212f0c8f2610340bb32ec0a02f72a74a94e246e42d4e5c060a01 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddee5feec04fc29a3092865fd8d67afe8d2dddc1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14e58d4f2035e166121994f82a2faf987802d9b14dafeb2ff470abfb828bd25 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f22d48c8c5042ebcbfafed4dbbe239eb0e78464 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c4effdbfe2b03effdf04975026a1cb3f636a47a4b22642998e63b6f87cd1179 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5bc9448467ce35df91a3f5b31e919ebf07e5e03 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6840509dca0a783650cc30553d3c3475540fc870d6780d3a4bf7660a2116ba7b +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0eb44ee2645a92bb28936d7fbce53298cabf913a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db628821ba422c19bbedfd8669d18113a8e66d2a2dc90cf1702314ea54a94f00 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd2b9aef86529798137c2868d556e873a23c785c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/latest @@ -0,0 +1 @@ +global_step140 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6e613ad02e1482b1eef52ff51329fe67d4fceb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9c57c64e42f5d7ec5b6fd8bf14122cd4f49a4ae907dcde9c057b79cc82e639 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1bfaa14232cac8bee00e1eca34caf5a744894e46 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/trainer_state.json @@ -0,0 +1,674 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 2.8, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 762514492620800.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-140/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2ef29df38bc6e3c1021b3075cea3b52b5a028f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3001b1886685b38f54e0bab5e4d0662693479376d2f47f3be3cd65210f0f7d2a +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7c861f2d2682e2509db200bc5ddcbee76fb44bb --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ae0fc023dd3979885b7c77c52f0d32d0446d491c5e4dbb23c71e8e27d6386f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ab83eb96bc2cd48b0b79926769796f67647ab77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d81d86591c0dd69262aff408277e2f0c4d066f4d1caf6919bfd0bfecad92999 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf612ad34056aa3a03dc5306b6c9659615d37da4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5746d388906744bf3a694b5f1376427047b4f9ef36022f523404f7ecda89fcb +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1df8404b7a7287e07c7e4a832091d0fb4f5946cc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e4e1102c17b86b0d37279098cb253cfa450c0b7279f89a2e9808b61d7b6e7c +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dff1b90eb9b8c37965d91166174e44b7a48040f3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6210220622a6b66955001e1da4f090d7185095512de0464b232ccfee753e770 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d3accb9e2553653297e6fb9a08b1bd56a2024ebc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff7926264181fbc3b08087bc8671fefeed728d30492d27082c92a6baa5acc1d1 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0694e654c847feeeb23accd5a248ebe1b2ff734d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac714119289d25306866d4c32dc58782f2ecd36fbeed0b3459a2afcb498c1de +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cea5fc186f2296fb5476a21967bccc55e56d24d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8afcd411f985e53d11b0f688f78e1b80473cbfeaba80373a6aeb5cee74c1ff61 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bb52e204936712e129cc4c08a639498a951b866 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac0a1ff002b7ea57be0111c20ae68ecceee98a9d236a4ac21e0c202f49b2d9 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2d07e780c1bb54bcf619b05c2df9890dd71884f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7e4c6182b348c4107c85a615ee8ab650f16438b09d43847db76396d4e06c17a +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87ee5a07bcea5649bd44a7b7303cca9786048d1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5503c0901f4ed912ade9bcb7a3171e70a9cd00e24340eca823d4852a9fcd8cdc +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..078ae1035963b75da6d0a2a49210007be2f4341f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3305c3d11a96ed69f209f9daf4be77c2326880fb1f6f2cc97124c497244a45b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc62e5c3a4895c94b22bdb8eb8c2603b40f6d732 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de098f1b54f7348e1aad8356ceb37c1dd3f67f67c2f8ba01f449c578d60b7529 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6a5dc0611317b1db4432319d2c4976e99b904f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f0ee0811acfabf3c6d8b85fb6d806ed703aef221ca46526291c246849aaad7 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a84a371b19317aa9dbf040ae16e79435b6ab252 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0927ab4c9f967ff57e6bc4646231e44857811342421721c657957d99b89b2874 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4f88c6a36b34c4b013e016d8ce35099bdb76a23 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5ac3d3a9dba2878ff573133f68b2914dfda959f871c882380eb0c01ee09786 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/latest new file mode 100644 index 0000000000000000000000000000000000000000..3df30ded267d950ff3ca04cffb9660be12079ca6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/latest @@ -0,0 +1 @@ +global_step160 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e31a2394e12bf431ae13288c3d90fe4727f07fa7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6462d333dbc5bb5e497ea9b0adb960f7616f79e6eea63222de6d5bd559516 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1db0a0f44aa3ac1d82c3bf8dc2d8968eeba4ce7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b045e1bfa728f51c8b51ab0faa20b128a4fbd350da006b9b39a19e24abdf5a74 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..75de18f57a056bd6a5f89df1abd045678f3f919e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a3d058d2628a61848c2441d313f251278bd8f74ce43dc44d8cd8ad3e619a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fd100693bc9f3267d044ce4a16e702502dc03ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b7f72fc498e6eaa671cdc0e8a627a668b8ef607063a22ddb4edbc05e791be830 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aeeabfe119f1cb0c8c804f1b9a4d3049f478d69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12889af98e175b734a788f4c5b8c4da91dd61ff3a05aaf61b9d4c66aa3dd8ad6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fe0f42382ab06f4d26d753745a914c9e46100e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe21a86abfceeac2cf2f48afd61a9a506cf61a287f3403f1adf391bb2ffa5a83 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5830ca6bd04645962b6e56a00a91cd8349ca449c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:73488bec91f9dee6d8105d06f99edaf4d27b6b064250d4c7023f33285b2f3132 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..343d1c0475f0dc64100dc67b09195e047f1a7bcf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf6ee1cc2e1325b428a21172ec4e61b7220c5489751ea11c06bb66c77a0cd08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..229789af83e72e748f236450e9d2df977318d98a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b659f5e1f39ab526587d47a9d305eeca96cdb1335d25ff0a7b9958f685604b4 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6875efdf5c58f4f68d1ca1fa4b3852ddd51ed498 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/trainer_state.json @@ -0,0 +1,763 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 3.2, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 881343117918208.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-160/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6263b0ee4399a622475f1849eb034e0b16cbe01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91358f608a2b897a764e7453144d68c9d2bdc8b7e2f47e92bc9d167771f23a31 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c4c3bb9c038fb0ddcc5bb0c674925b7d25b43e1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d5e68f96f7fbbe4eaed96bb0dc5aa91cd89d1448a5f7c4168a4b6ca6c2de78d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cd94a9ffabc2c7787cef13e0009932b1ae0924e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d230ad3321d7b02257fe6207cc4741de90e6c193740ff5a85453499cb5fa3fd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ce4a7e69b8815f90b94a29301c151e6b2f44964 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92e0ff808e485057109c3f68797f0b0c97f0cdd3d791152bd95ba1b8a9922231 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8964143fd7da849ae543155e5373b16dc63d7908 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e598f445656fdc4167e8f829363fb0951d01608749b42c209d06ea2d584155bd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aed8f91085ef3681083ef1bc7bbdd490b776317 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b80d889df5fce0b7778bf1886f7ffd8711c228fda77892a8a276b7bbdac99160 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..cdb987dfa3454c1e95d40d2c5f8fdab8a4219435 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:702ff3c582f693c8ff2d116161422dbb7245c3bdafcdafe219fd7c7265d71d65 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b243b03770b823cf7cc59a4ec92a47df76eef1ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09837952ef8acac15725433270cf54e5867f7771c1588e5be7d519ea683e9ff7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..302402c5a4570ead540b504b9bc0d1e9170e8670 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2547dfae2a9f184961d66a3ff483330c79b413c80b4cd0c49ee276f7bf3bc906 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2932a8773c16b6882e1419d8469957df90af1997 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18181ab077dda4474164b6d499e3f674797f9b6b82cadcb04aedb31aab88d290 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d78a85a66ef239c8b3fb8a531c558de71e9c72b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:270207ddc5f40f1c24ac22b40b5693f65c3476ded9e9e656f54628f211221bbf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f6c30dca8b331694f63d796f11ef8b636eb3216 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f6985acc97513eb6d5e31edb03929f8373ca142aa960eeba2518ec6deffaee4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4808a28899a3d1edf255d9b2f1d9b1807a045730 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1b45cdbb1e21d261f12f8c1ed8dd9f08ac2fa49f2344f97d1b28056e59bef7 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e13e050e5de104e653fd50cbe84dd81c07982fe --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c785f1969585fdf1c0a111687ec9da930fbaecbf76efb9d1a48e31e8feb8bdf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..756f0b04cd2a8c39cc5fe5c5da49513d6b6ec9c8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70365c55ca295b1ad744259f1d077790d9a5e113116b01c8c2e36e8ff78964c8 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da9049faa15e3b40ff4863f9162b59729aeb0d97 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11403d6dd013ec425d5d5997d4819527449778b16ed1b551419e7374f0c3e50c +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f51cd9c27d4398c68db9cb5a4b59ddd0cb747112 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a0e692d2fb2d17a829f3b429299bf9bbf4d690eba4ec8854cf7b9cb26489e4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/latest new file mode 100644 index 0000000000000000000000000000000000000000..eac7d625396c2750025575c77b8da5d622b0c7dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/latest @@ -0,0 +1 @@ +global_step180 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f51b498d48145bd9cc14b35f8236b9ec95a4f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08e59ac81067b262a084604cd3392250166c2841 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20a24c17b4be2ee59cd5e6682010519318a91e58 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..54050f6cf8fb847e2a926e14a7aad2647761521a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..263aae475c49b090bce43f143308192c5bf9a95b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..942ed5d60ae87dce686b33da76a34db404036dc6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..57789be3df3983cb8acc1500bf6470ffadb1c578 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32d6e2e7eb7148713b473b0c821a98e616ab6e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18942cfbbbc36710e196a20b862a745c9dcc2468 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa6cf7ac608af8ab72180ce60dcfa61b0bf4eeab8e185f70f65a95b45e6b7a +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..12ee877a0fa9e555dacb0ddfcef386334c7d21f3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/trainer_state.json @@ -0,0 +1,852 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 3.6, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.7365315552744873, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.35546875, + "logps/chosen": -120.0, + "logps/rejected": -520.0, + "loss": 0.26008996963500974, + "memory(GiB)": 77.63, + "nll_loss": 0.10498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.75, + "rewards/margins": 25.375, + "rewards/rejected": -11.5625, + "step": 165, + "train_speed(iter/s)": 
0.064049 + }, + { + "epoch": 3.4, + "grad_norm": 0.4367548710430112, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.1796875, + "logits/rejected": -0.6328125, + "logps/chosen": -466.0, + "logps/rejected": -171.0, + "loss": 0.22614412307739257, + "memory(GiB)": 77.63, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 24.75, + "rewards/rejected": 1.578125, + "step": 170, + "train_speed(iter/s)": 0.063846 + }, + { + "epoch": 3.5, + "grad_norm": 0.5745096147296118, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.56640625, + "logits/rejected": 0.0125732421875, + "logps/chosen": -255.0, + "logps/rejected": -540.0, + "loss": 0.24147272109985352, + "memory(GiB)": 77.63, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.625, + "rewards/margins": 28.75, + "rewards/rejected": -4.125, + "step": 175, + "train_speed(iter/s)": 0.063902 + }, + { + "epoch": 3.6, + "grad_norm": 0.3506997416185766, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.306640625, + "logits/rejected": -0.6953125, + "logps/chosen": -394.0, + "logps/rejected": -284.0, + "loss": 0.30578501224517823, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.5, + "rewards/margins": 28.0, + "rewards/rejected": -2.515625, + "step": 180, + "train_speed(iter/s)": 0.064229 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.302734375, + "eval_logps/chosen": -6.96875, + "eval_logps/rejected": -318.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.625, + "eval_rewards/rejected": -9.3125, + "eval_runtime": 7.0569, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + 
"save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 991931252867072.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-180/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dd71a7342dca94b92978704272888ed220dd8b7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:277f80dfa4d08208e6da6d7db12af922e0bf10aaed5c53898bfc61159b8b107e +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5e495714b0513cc0dea577c389aea60e5476b8b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aea72ef2a79c4773ce1c1b77b8057fc76c09beadcff8a9ed6d758f522411a85 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26fd65644ec6c20e63d7926fba95892079380b9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459d8be22d6f719bae12b063107922cfc23d44a2b838538fb1b5762650382a53 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93088c97eab2bf0ede2abbffa2b94af406b520ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655741c90a4df55c3d2372bb0d05bb3f30fac5a5186b985cfd2af8fbf46a489a +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d80bb3aa0f6ef52fbb5ac3e2d3e44849e77b3bb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d0e576db665b4d64cef5155fbc70a773f175fb31cf2dbe0e5dc2dc0a44fc7f3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b839eb9f660355138c6a0864a39fd76f737c1afe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1e8b03eb93f1b17be5a3fc945f52b5edeaecc42a2f30d006b92b66e59510a5 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8ccb62d68577ef8bfc0c2c744995d92408f8ab78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d276d99ac10abd0b9b0351d90391287fec87a31929bae5b805e8e64e47edf535 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45eac5bc68f0d622fa8d13edb5f3ba43349bd1bd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ab62bd787e4c4aaddaf724400dcec58581743406711d13240bf0de0be6c102 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9953dd440404ee5302a61878da078a5a33a986ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e335ad2e950dc9b2d1e1cefa9a7a21e9bb90c45ab2e79d390b274239a57ba2b5 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8eaf7ccf268dfb240bfd65263abf79405ba5bb2b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6dbf78c01cc387486e57c3c52abba1ad23ed5241f67411abbc3ab6b91a4584 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79c09e4417f6875a7f2cce5411840d131b2fb94c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daab2258b699a674b288b6764ed890eb58482a9167bb6b4c96d0d32e0ce12b28 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..098562d1db99b6738150c2fe4f67439574b4ef69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac2ac815b4252f4e39fc34f5595c625aa8c39f27f95204cb1fa0a96a126538b +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1f8e0e6652e57269cce59ab041057ef92c68a1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a2cac023e2b2b8375b4aa8d456b2c9bfbb90a11ba27250c5951f5b0e9170e8 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26ac12fdb1d0f4623900ededde1c22feb58f181 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db4f603e6de91f3092db4288f8470eb4a06cb0e7543ada42d10a8355fd3e65a +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddc87a4ae6e8e07cd3c9cdbed98d9d556a6345ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010816564ecc7ec7910e2be92d0891b318b6b92e650476d60320f079bc2e4f37 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48498d137683134c3f096f8910da02b41d84e14c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c8096a7800cd6276ab955027401cd0fb8e708ba4ba83d843702759c53d98a92 +size 1107782 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d40b8e07ce49da2d392a140a748830f0a6d23eb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8299a3869a7547d8b5bc3f3db39bb8df28f44685cfc0290eae514c12e006996 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2729ff9a97436d6c8ad743637f529065140ad3f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e1cf73eea4791075e839e628da180bf39e1e01fcc1630f4ac9c723d8793968 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..989d039cc84947296c4e4ae832b8070c9455e92b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.53662109, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20", + "epoch": 0.4, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 112495670329344.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d222aa33653d3e8aaa537a5775f10b605a36a18a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e6673497dcbd29506209ab0ae9dd04e0b6f49f75209dc21c78e163d2dfa1ed +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2898766b42ea71f2cef6dba5dc5e40fbe074b31d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb6fb97694e24cb9a2c9f0a6908386cbd80fd32765169151dbcd11f479a9efd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf755c5af331e9d804fb87fdf7440d65d1167205 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:289686da3dfad6eee5f8f5d2184cb211d7eb8b123688a56f5059a2013cfcb29b +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f35b5c19bbff968ef0f505f987f5199703eee37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f667f9f53420cabf0979b9ee6f659cfd02178d71bdd9753494a44defeb49ed +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e421799db1eaf9db65bb596040e13578eaf00eaa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef6eee557872cac8cf67374e423014b3172969b6c4316e0a205e73dd40f55baf +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe206e41e6f3c4246524215a3f14b39b686ca6bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbafb6ecd87b6e30f6148a5e41e06c4e09503fe2d97fbcd9d748992dcb5b6046 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f59e6f20df9b1c8be1df9ec9645c6beb2d10908e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08655bff102133c2e849aa30120e298184232ecadf582da63e8fe3fa8f7f609f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3df1adeb614e671ae5b6907728cf5faef1633596 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a393bcc7fc371af89e7c6b7c2ff730a60224f4c85fed3f057ee35989ed6d585 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cdbc514c79a75dcfbb0ac18ec5a7ab9e14d79f5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f33c49ece9a636503132f5bca4b73446ae6983c8441bfd19c7b6131b4a5b8511 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9386fa2a8228b2e5f2e4395e10e2712eeb7dcdb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede5c98e820d7441d5ea4dc7e02076da63a510c8165dd959015e40448b83505e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0b3f9c11b7795d3bb38bd8b60762a5029a231dd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce39ac0edfd96a8e0cb4bed9cad9832ed0b916bdd0ec96eb162e04b948c98b8a +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6348897346ea645c55ee5e7f01f60343ee1e492 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58808634b5798ff1df1b6e3d83a8f80e7e95f65c061b6290a2240374354178b1 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b931de2c3b78efb6a65d776fcd09849be9c1e333 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1edd4597ea0bf90f18b4f2b33792ec5bfd7bd196d316203dd88333b6fad63e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fd64dcc861c64b094e2a78e4d556a3c8da2f0ab --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efaf9ed02434e43945d9f7f55744d5d303e81c9aaf7d54e3fed025d668fee80 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..133c49fd0718a399dce3ea2825985a5238963e3f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acef0c26c4467202cc5e26ed2b86fb1aee58884addcca6a35145d04b1514139b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..215a238fa5e89d31c06bed08c3b76b5be797c446 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f382e120462df0af9d45c97312af7f6df26e2b4c9c44a32ec78707f2ff9d2e +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70716ec87cf9810cfa1dedbd709a42e3eb31fb60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683e06a81dc0cb8a7b54f36fb551c94aac8fe90fbe58946edc7288f10650f1ee +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/latest new file mode 100644 index 0000000000000000000000000000000000000000..753e24e10f3a2489150f458205cf759fd8b6081f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..584f4a4a43f100f35696d7314a633631af587f25 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891ffa7c7dae99113aa986d67278b52b8c57db55001dc3547a61f24569a34ee +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..05b027a867e5e9cebd446293ecff82cfb240cc76 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b92875cb04deec367605433847d1bda444b178b643d2da7ed9aaf738d232b4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..af98f0dfe2a5d89fbccf90df58246a0b078c7016 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f5f3338a05e325b5408a1cd0b6f5e5b10fad05fe479d63f44bec4cf18107d6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..715aa4a4ee3915f810fc2bacb2153eb8a0913781 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1be749fea477a3867d44010631937e0d8f071ca5f9614f9795c92c7fa68833a6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bde70899833455b6ee4a99aff9388abc5ffe92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc4a5ea4532c621f4c8e9891117b2e597a7f005001e8b4f2a1b4da8c82bf964 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..90cdeaa2fe438098e9d95ddbc06c765e51af1e78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480f9fe7dd71b54d915b46162e34b780ba2467d5542115cc809dbca60b394c0e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd30529614c5be239cd9477af6bef0e313740b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c11d982dcd813e82c2d97a5491ce9624cff2dd22e8655ea617ccef1fc1474470 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bed311094effd49cc2c89237c675f56eade157d1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73494fac3a001cba7cedd097b97f028d4c1d136ee6709214b0a7fe305e5b9089 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b08896e3e64039017a0606b43a6327f1f78848dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826281cb7f404c3805b9798147d05074dd208eac748e2052087055a015aaeaed +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3fae518a2aadf3cab067cce9b7fe3762410084d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/trainer_state.json @@ -0,0 +1,941 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 4.0, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.7365315552744873, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.35546875, + "logps/chosen": -120.0, + "logps/rejected": -520.0, + "loss": 0.26008996963500974, + "memory(GiB)": 77.63, + "nll_loss": 0.10498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.75, + "rewards/margins": 25.375, + "rewards/rejected": -11.5625, + "step": 165, + "train_speed(iter/s)": 
0.064049 + }, + { + "epoch": 3.4, + "grad_norm": 0.4367548710430112, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.1796875, + "logits/rejected": -0.6328125, + "logps/chosen": -466.0, + "logps/rejected": -171.0, + "loss": 0.22614412307739257, + "memory(GiB)": 77.63, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 24.75, + "rewards/rejected": 1.578125, + "step": 170, + "train_speed(iter/s)": 0.063846 + }, + { + "epoch": 3.5, + "grad_norm": 0.5745096147296118, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.56640625, + "logits/rejected": 0.0125732421875, + "logps/chosen": -255.0, + "logps/rejected": -540.0, + "loss": 0.24147272109985352, + "memory(GiB)": 77.63, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.625, + "rewards/margins": 28.75, + "rewards/rejected": -4.125, + "step": 175, + "train_speed(iter/s)": 0.063902 + }, + { + "epoch": 3.6, + "grad_norm": 0.3506997416185766, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.306640625, + "logits/rejected": -0.6953125, + "logps/chosen": -394.0, + "logps/rejected": -284.0, + "loss": 0.30578501224517823, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.5, + "rewards/margins": 28.0, + "rewards/rejected": -2.515625, + "step": 180, + "train_speed(iter/s)": 0.064229 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.302734375, + "eval_logps/chosen": -6.96875, + "eval_logps/rejected": -318.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.625, + "eval_rewards/rejected": -9.3125, + "eval_runtime": 7.0569, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.5026347543849498, + "learning_rate": 1.743948777242814e-05, + 
"logits/chosen": -0.13671875, + "logits/rejected": -0.478515625, + "logps/chosen": -360.0, + "logps/rejected": -221.0, + "loss": 0.262108588218689, + "memory(GiB)": 77.63, + "nll_loss": 0.41796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.375, + "rewards/margins": 24.875, + "rewards/rejected": -0.4765625, + "step": 185, + "train_speed(iter/s)": 0.06422 + }, + { + "epoch": 3.8, + "grad_norm": 0.45140535411962257, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -0.58203125, + "logits/rejected": -0.486328125, + "logps/chosen": -185.0, + "logps/rejected": -604.0, + "loss": 0.25142607688903806, + "memory(GiB)": 77.63, + "nll_loss": 0.2470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.25, + "rewards/margins": 31.125, + "rewards/rejected": -6.90625, + "step": 190, + "train_speed(iter/s)": 0.064487 + }, + { + "epoch": 3.9, + "grad_norm": 0.560007261391276, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.47265625, + "logits/rejected": -0.2177734375, + "logps/chosen": -332.0, + "logps/rejected": -656.0, + "loss": 0.2213657855987549, + "memory(GiB)": 77.63, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.75, + "rewards/margins": 31.125, + "rewards/rejected": -10.375, + "step": 195, + "train_speed(iter/s)": 0.064327 + }, + { + "epoch": 4.0, + "grad_norm": 0.2915492783630129, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.056396484375, + "logits/rejected": -0.166015625, + "logps/chosen": -232.0, + "logps/rejected": -668.0, + "loss": 0.2468355655670166, + "memory(GiB)": 77.63, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.375, + "rewards/margins": 31.5, + "rewards/rejected": -10.25, + "step": 200, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.3203125, + "eval_logps/chosen": -6.15625, + "eval_logps/rejected": -316.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.29296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.5, + "eval_rewards/rejected": -9.125, + "eval_runtime": 7.0303, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1100523974950912.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2de15d014d871e73fb6052fac487191a6ab0733 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d325b64d12ac30e3ef11dafe0c127c35cb408b4cf4e115e2f54287af73b1a25b +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dc515e2ae67fd60918417336cae255beeaf7b67 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf1ed5fffbe042679c3517d15577e04738b88a87803a0913e6d26b85edb37438 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b56c91be08c7a22300e57c27ebf38303284007e3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c62adee88984fb588375014980bae14c41569d257e7794ba5e548f1543b04dd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7801ff3fb64714350a8b0196ef9df4a71808d4e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c7edd5ce3febde8dfef667838728289cdbf4bed07730a4eec3c6427762b30 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6eef71df43b6a42ba10c3007aae13aaa1743c96 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a7b69f0e67c6bf793de06903124f0fbfb6eef405c95a7af7e6f356428312ee +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..604738c875f2b830d38214731ebd9b08138d8079 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d04a952f4eeed88318d32b7d43615b0a124ea890aeea76495b1a694cffd16d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8a7f4f58d5d80b0d7ea628503b55ddbf671ec003 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4389665cdab9d104b1e56ca2176cd33f40dab94506bbc7f69ec1ff2171534c47 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4acae027c146c0d2d5fbea702c41e7d90fdb79a4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47a2a494ba9f81cd131141615f47becf7c24bd8c7a0381b556e7c1e37e7ecd24 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be096e9509bd543d86259d3f73a03d599660be9a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:890cae584504786963bf2c45b2913f41723880b2009373cecb377ba755443be1 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4001549dead1ad26881eed12344ad9f1b53afd92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd8000652c140749c5d6a4228b4d83ab9fe446e45de13a198d258708b2b462d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d5899a85b9dfc8985f1f1ed3f2cbe702e73199 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffc2892dce57c0f0d33351b690a26abda9a93340adf054389475f1509d26df27 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbb56aeee3902fa6f032d487c5fd80fa68cdde29 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3096335b50ddd64e5cce779bb5a97f0338ebdd21b257d2af39d5b05c2e2146aa +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..299b48d0d16e2cb23a339c9c893115cdd75aba04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58ffc0b6942fff690685bf2ff03a20bfdcbc28a4a9dbeeafa0f468b8f82ae1d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ed8d5a05a049d1414a980e87cfd299824d0d2ea --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c05b153dcc2f38676593b081f8aee78880062670223507c85bbd0652f9a9e7b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe4d103dc0e50de5aa55879cb48986750b27b95 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987227f69464208f7f08217faaa9f524add2569c17dedb0375525e984ae0288b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0291f13e5a367e81a2c11056064b812a52d28295 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc96c452dafce2d8540f8199af19d91f057b9fa0abe7db28f7cea2b571dd391 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a197e45872e38a31da2ef0859424e206f8ee65c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4b35799b615e524accf610c7d04133c12a4409d1be01c05ec7a60d96d9a18d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9ebe2709e7f014a6431e10a08b9ee83756b9b83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/latest @@ -0,0 +1 @@ +global_step220 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc26f1e85f4e8e85881b70bb37705b907a71e2da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192b6eaac6b92a2de7d039b2fc8b1f373bff6953e1e6a952189b56167078edd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..597b599c32cb97bb28b361dbfe4532bfc1fad50b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/trainer_state.json @@ -0,0 +1,1030 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 4.4, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.7365315552744873, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.35546875, + "logps/chosen": -120.0, + "logps/rejected": -520.0, + "loss": 0.26008996963500974, + "memory(GiB)": 77.63, + "nll_loss": 0.10498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.75, + "rewards/margins": 25.375, + "rewards/rejected": -11.5625, + "step": 165, + "train_speed(iter/s)": 
0.064049 + }, + { + "epoch": 3.4, + "grad_norm": 0.4367548710430112, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.1796875, + "logits/rejected": -0.6328125, + "logps/chosen": -466.0, + "logps/rejected": -171.0, + "loss": 0.22614412307739257, + "memory(GiB)": 77.63, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 24.75, + "rewards/rejected": 1.578125, + "step": 170, + "train_speed(iter/s)": 0.063846 + }, + { + "epoch": 3.5, + "grad_norm": 0.5745096147296118, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.56640625, + "logits/rejected": 0.0125732421875, + "logps/chosen": -255.0, + "logps/rejected": -540.0, + "loss": 0.24147272109985352, + "memory(GiB)": 77.63, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.625, + "rewards/margins": 28.75, + "rewards/rejected": -4.125, + "step": 175, + "train_speed(iter/s)": 0.063902 + }, + { + "epoch": 3.6, + "grad_norm": 0.3506997416185766, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.306640625, + "logits/rejected": -0.6953125, + "logps/chosen": -394.0, + "logps/rejected": -284.0, + "loss": 0.30578501224517823, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.5, + "rewards/margins": 28.0, + "rewards/rejected": -2.515625, + "step": 180, + "train_speed(iter/s)": 0.064229 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.302734375, + "eval_logps/chosen": -6.96875, + "eval_logps/rejected": -318.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.625, + "eval_rewards/rejected": -9.3125, + "eval_runtime": 7.0569, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.5026347543849498, + "learning_rate": 1.743948777242814e-05, + 
"logits/chosen": -0.13671875, + "logits/rejected": -0.478515625, + "logps/chosen": -360.0, + "logps/rejected": -221.0, + "loss": 0.262108588218689, + "memory(GiB)": 77.63, + "nll_loss": 0.41796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.375, + "rewards/margins": 24.875, + "rewards/rejected": -0.4765625, + "step": 185, + "train_speed(iter/s)": 0.06422 + }, + { + "epoch": 3.8, + "grad_norm": 0.45140535411962257, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -0.58203125, + "logits/rejected": -0.486328125, + "logps/chosen": -185.0, + "logps/rejected": -604.0, + "loss": 0.25142607688903806, + "memory(GiB)": 77.63, + "nll_loss": 0.2470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.25, + "rewards/margins": 31.125, + "rewards/rejected": -6.90625, + "step": 190, + "train_speed(iter/s)": 0.064487 + }, + { + "epoch": 3.9, + "grad_norm": 0.560007261391276, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.47265625, + "logits/rejected": -0.2177734375, + "logps/chosen": -332.0, + "logps/rejected": -656.0, + "loss": 0.2213657855987549, + "memory(GiB)": 77.63, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.75, + "rewards/margins": 31.125, + "rewards/rejected": -10.375, + "step": 195, + "train_speed(iter/s)": 0.064327 + }, + { + "epoch": 4.0, + "grad_norm": 0.2915492783630129, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.056396484375, + "logits/rejected": -0.166015625, + "logps/chosen": -232.0, + "logps/rejected": -668.0, + "loss": 0.2468355655670166, + "memory(GiB)": 77.63, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.375, + "rewards/margins": 31.5, + "rewards/rejected": -10.25, + "step": 200, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.3203125, + "eval_logps/chosen": -6.15625, + "eval_logps/rejected": -316.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.29296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.5, + "eval_rewards/rejected": -9.125, + "eval_runtime": 7.0303, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.506342030165448, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.2109375, + "logits/rejected": -0.2158203125, + "logps/chosen": -272.0, + "logps/rejected": -652.0, + "loss": 0.2667506217956543, + "memory(GiB)": 77.63, + "nll_loss": 0.2412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 28.5, + "rewards/rejected": -6.34375, + "step": 205, + "train_speed(iter/s)": 0.063779 + }, + { + "epoch": 4.2, + "grad_norm": 0.5496148316299561, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.26953125, + "logits/rejected": -0.5625, + "logps/chosen": -185.0, + "logps/rejected": -238.0, + "loss": 0.20104532241821288, + "memory(GiB)": 77.63, + "nll_loss": 0.2119140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.375, + "rewards/margins": 25.375, + "rewards/rejected": -0.034423828125, + "step": 210, + "train_speed(iter/s)": 0.063893 + }, + { + "epoch": 4.3, + "grad_norm": 0.44211747113636407, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.4375, + "logits/rejected": 0.0162353515625, + "logps/chosen": -120.0, + "logps/rejected": -1128.0, + "loss": 0.24174799919128417, + "memory(GiB)": 77.63, + "nll_loss": 0.11572265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.625, + "rewards/margins": 36.75, + "rewards/rejected": -19.0, + "step": 215, + "train_speed(iter/s)": 0.06391 + }, + { + "epoch": 4.4, + "grad_norm": 0.38452329355351084, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.50390625, + "logits/rejected": -0.25, + "logps/chosen": -71.5, + "logps/rejected": -1048.0, + "loss": 0.22104406356811523, + "memory(GiB)": 77.63, + "nll_loss": 0.055908203125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 14.125, + "rewards/margins": 37.75, + "rewards/rejected": -23.625, + "step": 220, + "train_speed(iter/s)": 0.063802 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.31640625, + "eval_logps/chosen": -6.78125, + "eval_logps/rejected": -320.0, + "eval_loss": 0.43017578125, + "eval_nll_loss": 0.32421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.75, + "eval_rewards/rejected": -9.5, + "eval_runtime": 7.0342, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 220 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1215557845647360.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/zero_to_fp32.py 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-220/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cbc13314a02264bf7838ba4e415309400b807ed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d6e2dabbf776710fe852499bf81b9e77362e430963946e40bbc081eb2b66c87 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c95df6bbfe46443f41a33d938df92102f935c585 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19b70aaa39f388c5d769f22487de11659f0f4dbe0f7971ad915247ae6aad272 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d233bbc61c0b9a4d893d8ef6e269ddccf1fcd349 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af82d5a239e2a83278ae52d54a4b9763be0465b2c5c0c7ea9e3dd7018110cc2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ecdec7397ba40d42a311f0cc41ef39c2e381583 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd069a8afcfa2a9c1fb98a489b86d68e5b3820e1ab518810b82e04120fec0f47 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7fa06fd1ba6554aa273c0fd573869a9b00541 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48bf29b55a683c1a22a23c0a34261f66c348330b83494c4c342d89fd338c21dd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fa9dd917e45aa4f931847ce7330f181069a55fa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c44d2435c9369942ce519751928b2bbea6166b9d9f66acb8734a2f107863a0 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..7311ba165c81f22d49d43bf8f77c08a004d7a52a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e01ead2f54a6cdef970ba63229d623be92efd111e49886411c9552069b5e531 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac93e11a5554b58728cbf3868703730e064ecf79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93946ec32e07b716b5f79cd697aae46eadfd94ffac302dcee80d07b7d719ebb0 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3972d15bd77ca65ea4decba1f56373e39ce0341b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7b2b79ae2e7697aaf4ad11cd7a1c974e1c73ef9e3db6c47176a2e201a9931ac1 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c5c4048eb1d529dcbe9e99a6121a38a8a507d8a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f4291b39ea877bccddef328c62959bc2c2d769cb7381a9619d92618bc8e859 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7eccdf3e4244f8b654805d86ab9502dc4f5eec7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fbfbc2df12cd68b4a45ee00b2f35378e753888cff8d2fee66de1b21c09fe2d5 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..240734631e319f63014c88fc4e9de37a330ea7a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34431ad9bf9fb39759581f279df075ede3570a4802b8cc487b1b5c04f0a99043 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afc557da3be8f3f6fe56dba58de417f9950953b8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8db6b64587b82e05310f6dea6f26f7c15db23a59e9496299490d039ca34322 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01db5ad49a094ffeaf5918b3e5fe72b658f67eeb --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5634fb653619f31b3d2e26f7ca93bc49241f9b84514606f7f3049ee29c2c0719 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5d32ba8c6b706369cfa684ef45fdad23dfd878a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4761092fb1d9eb6cef1b0067e673037ad51671b4f1c043af77c6d001732903f6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee7ff4a2fb36ae046b8eec0df9c4777f67a4b672 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f334d97e640c681248521d4af909c1d4a8fa8e9c7852f6a8eb53efebc21a623e +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..969cee9b67c8455d4060c64d7775f5bf32de11d2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9374191d6ab47cedb4bb518431d0d2d1848b693b4bba79f4a0da99f90bd3124 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/latest new file mode 100644 index 0000000000000000000000000000000000000000..161e63cf7292b2184098d115f0621d2ed09e86c5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/latest @@ -0,0 +1 @@ +global_step240 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3a6ea45dd4e59b9683f66476f460fa0c77a9d66 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0c9979566a5d89cb3c766336548670ec6f2291deba1b7ab1764c12d3187b24 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42e6b0d6985c9b3f0cec701759e0b3d671c77abd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e36a570d6158fc25d1cf5d9f8f450fc64c5a7683330277f89ff76d5f2fc6cd +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..376994a32199299a2a48b62753947cdb1f7ad72a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f619cbef4b74f1680d667c8788285a602392e63bdf3760ef3a59ec8864d483 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f1edb2dfec55e5cbead7ae3d14351c3650c4f77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1fc037fba93ace1bf7ce01b1a5f7d785698d47b4cc2cedf2300bbf7a41ebf05c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..016d34db4ec6597c207021d026234c9692c3f3ad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab728c2461d6d1c64f04d7cbfdfcbfa7bd7ad0ef6e19d52458501ee81b27128 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d7824c2bd9e8b1cec7f0d84d673017b0da62e43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27530e653ebf5997ae3159cdcde264607e6a6f86b7e3c7a1b3a1e8301cd43d03 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f41ee261ad98d2d0eb8f09167a5b32604513b56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c1fddaeb1257697bd7c0101abf1ab23f2925d0d9165cd8bddfbd22f8444db2b7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8816834cc1c0e822e11a8df138fa41557f3a0fb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942af3734a320fe12a3205a47ca1cdc7d1f0996bfde86c020a35545ccd2fd418 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce5faf9896aeadd65d47acddb4b510a6fc3c65f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a46b33bfe1e26ebea81904070b93f8e7376ae49add370042b1998521eed8ba +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f77e3bcbaf15bbef8a5add1dfa60fd60d5f1a11 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/trainer_state.json @@ -0,0 +1,1119 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 4.8, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.7365315552744873, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.35546875, + "logps/chosen": -120.0, + "logps/rejected": -520.0, + "loss": 0.26008996963500974, + "memory(GiB)": 77.63, + "nll_loss": 0.10498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.75, + "rewards/margins": 25.375, + "rewards/rejected": -11.5625, + "step": 165, + "train_speed(iter/s)": 
0.064049 + }, + { + "epoch": 3.4, + "grad_norm": 0.4367548710430112, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.1796875, + "logits/rejected": -0.6328125, + "logps/chosen": -466.0, + "logps/rejected": -171.0, + "loss": 0.22614412307739257, + "memory(GiB)": 77.63, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 24.75, + "rewards/rejected": 1.578125, + "step": 170, + "train_speed(iter/s)": 0.063846 + }, + { + "epoch": 3.5, + "grad_norm": 0.5745096147296118, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.56640625, + "logits/rejected": 0.0125732421875, + "logps/chosen": -255.0, + "logps/rejected": -540.0, + "loss": 0.24147272109985352, + "memory(GiB)": 77.63, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.625, + "rewards/margins": 28.75, + "rewards/rejected": -4.125, + "step": 175, + "train_speed(iter/s)": 0.063902 + }, + { + "epoch": 3.6, + "grad_norm": 0.3506997416185766, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.306640625, + "logits/rejected": -0.6953125, + "logps/chosen": -394.0, + "logps/rejected": -284.0, + "loss": 0.30578501224517823, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.5, + "rewards/margins": 28.0, + "rewards/rejected": -2.515625, + "step": 180, + "train_speed(iter/s)": 0.064229 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.302734375, + "eval_logps/chosen": -6.96875, + "eval_logps/rejected": -318.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.625, + "eval_rewards/rejected": -9.3125, + "eval_runtime": 7.0569, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.5026347543849498, + "learning_rate": 1.743948777242814e-05, + 
"logits/chosen": -0.13671875, + "logits/rejected": -0.478515625, + "logps/chosen": -360.0, + "logps/rejected": -221.0, + "loss": 0.262108588218689, + "memory(GiB)": 77.63, + "nll_loss": 0.41796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.375, + "rewards/margins": 24.875, + "rewards/rejected": -0.4765625, + "step": 185, + "train_speed(iter/s)": 0.06422 + }, + { + "epoch": 3.8, + "grad_norm": 0.45140535411962257, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -0.58203125, + "logits/rejected": -0.486328125, + "logps/chosen": -185.0, + "logps/rejected": -604.0, + "loss": 0.25142607688903806, + "memory(GiB)": 77.63, + "nll_loss": 0.2470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.25, + "rewards/margins": 31.125, + "rewards/rejected": -6.90625, + "step": 190, + "train_speed(iter/s)": 0.064487 + }, + { + "epoch": 3.9, + "grad_norm": 0.560007261391276, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.47265625, + "logits/rejected": -0.2177734375, + "logps/chosen": -332.0, + "logps/rejected": -656.0, + "loss": 0.2213657855987549, + "memory(GiB)": 77.63, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.75, + "rewards/margins": 31.125, + "rewards/rejected": -10.375, + "step": 195, + "train_speed(iter/s)": 0.064327 + }, + { + "epoch": 4.0, + "grad_norm": 0.2915492783630129, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.056396484375, + "logits/rejected": -0.166015625, + "logps/chosen": -232.0, + "logps/rejected": -668.0, + "loss": 0.2468355655670166, + "memory(GiB)": 77.63, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.375, + "rewards/margins": 31.5, + "rewards/rejected": -10.25, + "step": 200, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.3203125, + "eval_logps/chosen": -6.15625, + "eval_logps/rejected": -316.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.29296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.5, + "eval_rewards/rejected": -9.125, + "eval_runtime": 7.0303, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.506342030165448, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.2109375, + "logits/rejected": -0.2158203125, + "logps/chosen": -272.0, + "logps/rejected": -652.0, + "loss": 0.2667506217956543, + "memory(GiB)": 77.63, + "nll_loss": 0.2412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 28.5, + "rewards/rejected": -6.34375, + "step": 205, + "train_speed(iter/s)": 0.063779 + }, + { + "epoch": 4.2, + "grad_norm": 0.5496148316299561, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.26953125, + "logits/rejected": -0.5625, + "logps/chosen": -185.0, + "logps/rejected": -238.0, + "loss": 0.20104532241821288, + "memory(GiB)": 77.63, + "nll_loss": 0.2119140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.375, + "rewards/margins": 25.375, + "rewards/rejected": -0.034423828125, + "step": 210, + "train_speed(iter/s)": 0.063893 + }, + { + "epoch": 4.3, + "grad_norm": 0.44211747113636407, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.4375, + "logits/rejected": 0.0162353515625, + "logps/chosen": -120.0, + "logps/rejected": -1128.0, + "loss": 0.24174799919128417, + "memory(GiB)": 77.63, + "nll_loss": 0.11572265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.625, + "rewards/margins": 36.75, + "rewards/rejected": -19.0, + "step": 215, + "train_speed(iter/s)": 0.06391 + }, + { + "epoch": 4.4, + "grad_norm": 0.38452329355351084, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.50390625, + "logits/rejected": -0.25, + "logps/chosen": -71.5, + "logps/rejected": -1048.0, + "loss": 0.22104406356811523, + "memory(GiB)": 77.63, + "nll_loss": 0.055908203125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 14.125, + "rewards/margins": 37.75, + "rewards/rejected": -23.625, + "step": 220, + "train_speed(iter/s)": 0.063802 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.31640625, + "eval_logps/chosen": -6.78125, + "eval_logps/rejected": -320.0, + "eval_loss": 0.43017578125, + "eval_nll_loss": 0.32421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.75, + "eval_rewards/rejected": -9.5, + "eval_runtime": 7.0342, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.3709354527062318, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.62109375, + "logps/chosen": -476.0, + "logps/rejected": -209.0, + "loss": 0.24149389266967775, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 31.75, + "rewards/margins": 27.375, + "rewards/rejected": 4.34375, + "step": 225, + "train_speed(iter/s)": 0.063634 + }, + { + "epoch": 4.6, + "grad_norm": 0.4235065854735603, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.09521484375, + "logits/rejected": -0.53125, + "logps/chosen": -210.0, + "logps/rejected": -804.0, + "loss": 0.20839576721191405, + "memory(GiB)": 77.63, + "nll_loss": 0.265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.125, + "rewards/margins": 35.5, + "rewards/rejected": -11.4375, + "step": 230, + "train_speed(iter/s)": 0.063609 + }, + { + "epoch": 4.7, + "grad_norm": 0.7452796123378559, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -0.5078125, + "logits/rejected": -0.2470703125, + "logps/chosen": -153.0, + "logps/rejected": -772.0, + "loss": 0.17243645191192628, + "memory(GiB)": 77.63, + "nll_loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 35.75, + "rewards/rejected": -18.5, + "step": 235, + "train_speed(iter/s)": 
0.063676 + }, + { + "epoch": 4.8, + "grad_norm": 0.31765564689044157, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -0.451171875, + "logits/rejected": -0.640625, + "logps/chosen": -308.0, + "logps/rejected": -664.0, + "loss": 0.21198019981384278, + "memory(GiB)": 77.63, + "nll_loss": 0.44140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 28.125, + "rewards/margins": 35.0, + "rewards/rejected": -6.71875, + "step": 240, + "train_speed(iter/s)": 0.063884 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.314453125, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -320.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.333984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.75, + "eval_rewards/rejected": -9.5, + "eval_runtime": 7.1052, + "eval_samples_per_second": 0.563, + "eval_steps_per_second": 0.141, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1322897605394432.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-240/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca3499511f3d3f2e58bdaebeca238d36d3c844f2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f66fca9a88da3bda53748617ce75b909b84b09b65c466f92718126db4de222 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1a833417a3a27effe38bb26a54690588c05215c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b9775ce580626b30ee19e39e23b93ad0eeafc7915deeaecf70f827ad6decc3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08582f9a3fc04d1663c0728a4d3ca523291916b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15129ace0c420870567eb51ab4e98c75aedc6f9a25406b9661a33e103169b1e4 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..febec38edbcf49d10f9ebc52854403a277d53c1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50261dee7bc853590041c2faab2eb90ad64e21e77d61d45bdfa9ee29d7c39dfc +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ede52281a84eec09246c4b272c5156ae993f134 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bed1bfa8123e705edd109ab4bc927981900aff3fbdc576b9d5ef08e5b700e7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c7e7e9a25e44e3c5fa911d27ea7bf561c528ce0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e6a050ebbb523cfc3f330137ab1196b4063c72d9350a1cebc0b4427492f475 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d2d767901066a7da5ba1b7673b2e349557e39f6c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1260d5e0f0cde94da9ab0a1186aec0aa2af54e810e09e8e4ec69715b67864e95 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ced808beddefaba59d3fd26d729d77e684f3b95 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5d0c2fa3b83a8fdc82e05e1b51f66072a5ff56e406fa8e7c151be708555d80 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b139a10852a4fbcf4c87b981470e2dbf3e3ea48f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:76083754fa8d14641c6e5b746c83a6ba6cb3f3d25b5aae1cdf81923c1f6bd1a8 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6694a24fdc3c4cf0ded2aec93e744dd0bd838be0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01daf2aa78d1508af6af8b682498b59f1acea14b32e777d57501e304d8c4f9da +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d6615675ca9f69d9f07495b8090cc443518c547 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cc6a70df9ae3d1ee7bc929c4c0bbfe4e3731428092e85327b35e30a5ae8237 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5086dae836696305af01de0b638502e70d220332 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc49f0e17fcdddd2eaee705127fa9e712f220206c8f73007ca85e65f1e725fdc +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b10c341c534752c8a43f711ad35e77d10177cf19 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572fd43ac65ad0fa936ca11d5ee944d01cf4b35dcc4992af46eca8847cabe07b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..261d3b91781ccf654958e654abcc4927a41e6f82 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4948cd49ac00314d113278574704e1b5cc94f34b1b2b1178be59a9efa13970 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59c947868950aaa46e2a3119b5dd046eaee51262 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a985abf4386f9d35749282fd267d92c1ea26b4cbfd18b7872354dddefd8a8a57 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..016569b53b1ce7e2ff8a1ac91592a572a998a0b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be01680f5078d9975ab330bee60c14207fbd678d6bb4802404169479ee609ccf +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1885afb3725fc0170ede2ef1d86f2e4dc7872a50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c89989676332620d98f9299db3284bd7484bda9219fe58d938d654e8f786bbf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/latest new file mode 100644 index 0000000000000000000000000000000000000000..87449ff1a854ba4a77ea33fbc24adaed3311d6b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/latest @@ -0,0 +1 @@ +global_step250 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab29abc7c5c196288fd5c119c67c4f655f27d44c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5c4738c31c5c9a38e1f586256d59a0e8e7d02641b9b9af2afdbe078440aeb4 +size 
15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8e0ba47a098b34da66857368b41c80a5d9d796f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d374b3390eb52ec7f6161c06272d4f26cb715692bdf2ad5374287b6de420ca3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7676e48e7dd332be5f46585fc5f824c5791f76ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24111edc5a6a2994166cd410155ee3c630816d0fe21c13808ebd2a2ae45bc9d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..228202ae722c05ed5fafc13eeac33a8a2685cca5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:157b21eda1c7f898e519251deed08049767ffba123797289de56343a92ba7380 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a63de21fa3e29782ced5828f8f34fba46bad33 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb615552e5845759bc13aa2ae50c0525fbf941fa76ee2e2c20cb9838fe1995 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d487727115f1120e55e91ad9583fb23ff8e34083 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf720fc22147ce563d6f2c2f6f3d916a7e8b7af174b480d072b5c822e992aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90628d8fd79ee2a98fb904251b6d7938f5120b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_6.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d055d3b033dc8e6fc2a19aa95162960544ab94a903988874315efe4ed5aa8e13 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e1556a7ec04e7309f4c9130351c880ef6a0626 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e03c685f2e019350bfdd41f006495a18690aacbccd7ffc1f40de827f433eb87 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..871b4a6cbd60ea4b2ef2416f3a46bbe632ddb667 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b1af2ae92a304371e36f6c1b7001f5dafc395be0b17c480957fc7fb58d8cd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7aca0a8d33c71e26729b59fff610c09eb0a3f939 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/trainer_state.json @@ -0,0 +1,1172 @@ 
+{ + "best_metric": 0.40966797, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 
1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + 
"epoch": 1.0, + "grad_norm": 0.9252536379780557, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + 
"logits/rejected": -0.1865234375, + "logps/chosen": -249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + 
"eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6576496300320993, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.5078125, + "logps/chosen": -217.0, + "logps/rejected": -796.0, + "loss": 0.54422607421875, + "memory(GiB)": 77.63, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.5625, + "rewards/margins": 28.5, + "rewards/rejected": -14.875, + "step": 85, + "train_speed(iter/s)": 0.065121 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545271461639326, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.375, + "logits/rejected": -0.73828125, + "logps/chosen": -235.0, + "logps/rejected": -532.0, + "loss": 0.39871826171875, + "memory(GiB)": 77.63, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.1875, + "rewards/margins": 23.125, + "rewards/rejected": -8.875, + "step": 90, + "train_speed(iter/s)": 0.064715 + }, + { + "epoch": 1.9, + "grad_norm": 0.3935793421122154, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.328125, + "logits/rejected": -0.328125, + "logps/chosen": -282.0, + "logps/rejected": -564.0, + "loss": 0.41851806640625, + "memory(GiB)": 77.63, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 26.5, + "rewards/rejected": -9.25, + "step": 95, + "train_speed(iter/s)": 0.06466 + }, + { + "epoch": 2.0, + "grad_norm": 0.36803625473665347, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.390625, + "logits/rejected": -0.42578125, + "logps/chosen": -212.0, + "logps/rejected": -384.0, + "loss": 0.38671875, + "memory(GiB)": 77.63, + "nll_loss": 0.3359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 19.125, + "rewards/rejected": -5.375, + 
"step": 100, + "train_speed(iter/s)": 0.064448 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.87109375, + "eval_logits/rejected": -0.45703125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -304.0, + "eval_loss": 0.40966796875, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.5, + "eval_rewards/margins": 16.375, + "eval_rewards/rejected": -7.90625, + "eval_runtime": 6.9622, + "eval_samples_per_second": 0.575, + "eval_steps_per_second": 0.144, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.5015689962645361, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.396484375, + "logits/rejected": -0.7109375, + "logps/chosen": -332.0, + "logps/rejected": -640.0, + "loss": 0.3717987060546875, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 26.625, + "rewards/rejected": -5.71875, + "step": 105, + "train_speed(iter/s)": 0.063793 + }, + { + "epoch": 2.2, + "grad_norm": 0.6199367674381827, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.291015625, + "logits/rejected": -0.25, + "logps/chosen": -306.0, + "logps/rejected": -1040.0, + "loss": 0.3555938720703125, + "memory(GiB)": 77.63, + "nll_loss": 0.33203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.9375, + "rewards/margins": 32.5, + "rewards/rejected": -16.5, + "step": 110, + "train_speed(iter/s)": 0.063742 + }, + { + "epoch": 2.3, + "grad_norm": 0.3369780668254126, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.2109375, + "logits/rejected": -0.44921875, + "logps/chosen": -354.0, + "logps/rejected": -696.0, + "loss": 0.38040924072265625, + "memory(GiB)": 77.63, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 26.0, + "rewards/rejected": -7.21875, + "step": 115, + "train_speed(iter/s)": 0.063906 + }, + { + "epoch": 2.4, + "grad_norm": 0.5972625645626695, + "learning_rate": 
5.7592521739125726e-05, + "logits/chosen": -0.43359375, + "logits/rejected": -0.546875, + "logps/chosen": -278.0, + "logps/rejected": -326.0, + "loss": 0.3116204261779785, + "memory(GiB)": 77.63, + "nll_loss": 0.267578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 19.0, + "rewards/rejected": -1.84375, + "step": 120, + "train_speed(iter/s)": 0.064398 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.83984375, + "eval_logits/rejected": -0.3515625, + "eval_logps/chosen": -5.84375, + "eval_logps/rejected": -306.0, + "eval_loss": 0.4169921875, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 16.5, + "eval_rewards/rejected": -8.125, + "eval_runtime": 7.0281, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619924076423551, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.859375, + "logits/rejected": -0.5078125, + "logps/chosen": -17.5, + "logps/rejected": -1072.0, + "loss": 0.32363739013671877, + "memory(GiB)": 77.63, + "nll_loss": 0.2177734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.25, + "rewards/margins": 28.25, + "rewards/rejected": -16.0, + "step": 125, + "train_speed(iter/s)": 0.064435 + }, + { + "epoch": 2.6, + "grad_norm": 1.419414316974697, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.52734375, + "logits/rejected": -0.66015625, + "logps/chosen": -304.0, + "logps/rejected": -278.0, + "loss": 0.31024856567382814, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 21.75, + "rewards/rejected": -2.640625, + "step": 130, + "train_speed(iter/s)": 0.064846 + }, + { + "epoch": 2.7, + "grad_norm": 1.043372967452423, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.484375, + "logits/rejected": -0.34375, + "logps/chosen": -116.0, + "logps/rejected": 
-664.0, + "loss": 0.29241142272949217, + "memory(GiB)": 77.63, + "nll_loss": 0.126953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.3125, + "rewards/margins": 26.0, + "rewards/rejected": -13.625, + "step": 135, + "train_speed(iter/s)": 0.064914 + }, + { + "epoch": 2.8, + "grad_norm": 0.424388253912613, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.4765625, + "logits/rejected": -0.62890625, + "logps/chosen": -276.0, + "logps/rejected": -684.0, + "loss": 0.33972806930541993, + "memory(GiB)": 77.63, + "nll_loss": 0.498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.125, + "rewards/margins": 26.0, + "rewards/rejected": -7.8125, + "step": 140, + "train_speed(iter/s)": 0.065209 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.25, + "eval_logps/chosen": -5.71875, + "eval_logps/rejected": -308.0, + "eval_loss": 0.4150390625, + "eval_nll_loss": 0.271484375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -8.3125, + "eval_runtime": 7.0195, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6261110291294545, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.1591796875, + "logits/rejected": -0.41796875, + "logps/chosen": -434.0, + "logps/rejected": -652.0, + "loss": 0.3439308166503906, + "memory(GiB)": 77.63, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 27.125, + "rewards/rejected": -6.90625, + "step": 145, + "train_speed(iter/s)": 0.064432 + }, + { + "epoch": 3.0, + "grad_norm": 0.3948730156432553, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": -0.013671875, + "logits/rejected": -0.451171875, + "logps/chosen": -328.0, + "logps/rejected": -284.0, + "loss": 0.264852237701416, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 22.0, + "rewards/margins": 23.625, + "rewards/rejected": -1.6640625, + "step": 150, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 3.1, + "grad_norm": 0.39090195989326426, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.3671875, + "logits/rejected": -0.5234375, + "logps/chosen": -434.0, + "logps/rejected": -648.0, + "loss": 0.31949734687805176, + "memory(GiB)": 77.63, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.125, + "rewards/margins": 32.5, + "rewards/rejected": -11.4375, + "step": 155, + "train_speed(iter/s)": 0.064309 + }, + { + "epoch": 3.2, + "grad_norm": 0.31310930959607486, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.47265625, + "logits/rejected": -0.765625, + "logps/chosen": -320.0, + "logps/rejected": -382.0, + "loss": 0.2711037635803223, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.75, + "rewards/margins": 24.0, + "rewards/rejected": -0.265625, + "step": 160, + "train_speed(iter/s)": 0.064187 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -0.81640625, + "eval_logits/rejected": -0.3125, + "eval_logps/chosen": -5.90625, + "eval_logps/rejected": -312.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.0, + "eval_rewards/rejected": -8.6875, + "eval_runtime": 7.0385, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.7365315552744873, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.35546875, + "logps/chosen": -120.0, + "logps/rejected": -520.0, + "loss": 0.26008996963500974, + "memory(GiB)": 77.63, + "nll_loss": 0.10498046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.75, + "rewards/margins": 25.375, + "rewards/rejected": -11.5625, + "step": 165, + "train_speed(iter/s)": 
0.064049 + }, + { + "epoch": 3.4, + "grad_norm": 0.4367548710430112, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.1796875, + "logits/rejected": -0.6328125, + "logps/chosen": -466.0, + "logps/rejected": -171.0, + "loss": 0.22614412307739257, + "memory(GiB)": 77.63, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 24.75, + "rewards/rejected": 1.578125, + "step": 170, + "train_speed(iter/s)": 0.063846 + }, + { + "epoch": 3.5, + "grad_norm": 0.5745096147296118, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.56640625, + "logits/rejected": 0.0125732421875, + "logps/chosen": -255.0, + "logps/rejected": -540.0, + "loss": 0.24147272109985352, + "memory(GiB)": 77.63, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.625, + "rewards/margins": 28.75, + "rewards/rejected": -4.125, + "step": 175, + "train_speed(iter/s)": 0.063902 + }, + { + "epoch": 3.6, + "grad_norm": 0.3506997416185766, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.306640625, + "logits/rejected": -0.6953125, + "logps/chosen": -394.0, + "logps/rejected": -284.0, + "loss": 0.30578501224517823, + "memory(GiB)": 77.63, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.5, + "rewards/margins": 28.0, + "rewards/rejected": -2.515625, + "step": 180, + "train_speed(iter/s)": 0.064229 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.302734375, + "eval_logps/chosen": -6.96875, + "eval_logps/rejected": -318.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.625, + "eval_rewards/rejected": -9.3125, + "eval_runtime": 7.0569, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.5026347543849498, + "learning_rate": 1.743948777242814e-05, + 
"logits/chosen": -0.13671875, + "logits/rejected": -0.478515625, + "logps/chosen": -360.0, + "logps/rejected": -221.0, + "loss": 0.262108588218689, + "memory(GiB)": 77.63, + "nll_loss": 0.41796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.375, + "rewards/margins": 24.875, + "rewards/rejected": -0.4765625, + "step": 185, + "train_speed(iter/s)": 0.06422 + }, + { + "epoch": 3.8, + "grad_norm": 0.45140535411962257, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": -0.58203125, + "logits/rejected": -0.486328125, + "logps/chosen": -185.0, + "logps/rejected": -604.0, + "loss": 0.25142607688903806, + "memory(GiB)": 77.63, + "nll_loss": 0.2470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.25, + "rewards/margins": 31.125, + "rewards/rejected": -6.90625, + "step": 190, + "train_speed(iter/s)": 0.064487 + }, + { + "epoch": 3.9, + "grad_norm": 0.560007261391276, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.47265625, + "logits/rejected": -0.2177734375, + "logps/chosen": -332.0, + "logps/rejected": -656.0, + "loss": 0.2213657855987549, + "memory(GiB)": 77.63, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.75, + "rewards/margins": 31.125, + "rewards/rejected": -10.375, + "step": 195, + "train_speed(iter/s)": 0.064327 + }, + { + "epoch": 4.0, + "grad_norm": 0.2915492783630129, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.056396484375, + "logits/rejected": -0.166015625, + "logps/chosen": -232.0, + "logps/rejected": -668.0, + "loss": 0.2468355655670166, + "memory(GiB)": 77.63, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.375, + "rewards/margins": 31.5, + "rewards/rejected": -10.25, + "step": 200, + "train_speed(iter/s)": 0.064146 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.8125, + "eval_logits/rejected": -0.3203125, + "eval_logps/chosen": -6.15625, + "eval_logps/rejected": -316.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.29296875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.375, + "eval_rewards/margins": 17.5, + "eval_rewards/rejected": -9.125, + "eval_runtime": 7.0303, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.506342030165448, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.2109375, + "logits/rejected": -0.2158203125, + "logps/chosen": -272.0, + "logps/rejected": -652.0, + "loss": 0.2667506217956543, + "memory(GiB)": 77.63, + "nll_loss": 0.2412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 28.5, + "rewards/rejected": -6.34375, + "step": 205, + "train_speed(iter/s)": 0.063779 + }, + { + "epoch": 4.2, + "grad_norm": 0.5496148316299561, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.26953125, + "logits/rejected": -0.5625, + "logps/chosen": -185.0, + "logps/rejected": -238.0, + "loss": 0.20104532241821288, + "memory(GiB)": 77.63, + "nll_loss": 0.2119140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.375, + "rewards/margins": 25.375, + "rewards/rejected": -0.034423828125, + "step": 210, + "train_speed(iter/s)": 0.063893 + }, + { + "epoch": 4.3, + "grad_norm": 0.44211747113636407, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.4375, + "logits/rejected": 0.0162353515625, + "logps/chosen": -120.0, + "logps/rejected": -1128.0, + "loss": 0.24174799919128417, + "memory(GiB)": 77.63, + "nll_loss": 0.11572265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.625, + "rewards/margins": 36.75, + "rewards/rejected": -19.0, + "step": 215, + "train_speed(iter/s)": 0.06391 + }, + { + "epoch": 4.4, + "grad_norm": 0.38452329355351084, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.50390625, + "logits/rejected": -0.25, + "logps/chosen": -71.5, + "logps/rejected": -1048.0, + "loss": 0.22104406356811523, + "memory(GiB)": 77.63, + "nll_loss": 0.055908203125, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 14.125, + "rewards/margins": 37.75, + "rewards/rejected": -23.625, + "step": 220, + "train_speed(iter/s)": 0.063802 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.31640625, + "eval_logps/chosen": -6.78125, + "eval_logps/rejected": -320.0, + "eval_loss": 0.43017578125, + "eval_nll_loss": 0.32421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.75, + "eval_rewards/rejected": -9.5, + "eval_runtime": 7.0342, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.3709354527062318, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.62109375, + "logps/chosen": -476.0, + "logps/rejected": -209.0, + "loss": 0.24149389266967775, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 31.75, + "rewards/margins": 27.375, + "rewards/rejected": 4.34375, + "step": 225, + "train_speed(iter/s)": 0.063634 + }, + { + "epoch": 4.6, + "grad_norm": 0.4235065854735603, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.09521484375, + "logits/rejected": -0.53125, + "logps/chosen": -210.0, + "logps/rejected": -804.0, + "loss": 0.20839576721191405, + "memory(GiB)": 77.63, + "nll_loss": 0.265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.125, + "rewards/margins": 35.5, + "rewards/rejected": -11.4375, + "step": 230, + "train_speed(iter/s)": 0.063609 + }, + { + "epoch": 4.7, + "grad_norm": 0.7452796123378559, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -0.5078125, + "logits/rejected": -0.2470703125, + "logps/chosen": -153.0, + "logps/rejected": -772.0, + "loss": 0.17243645191192628, + "memory(GiB)": 77.63, + "nll_loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 35.75, + "rewards/rejected": -18.5, + "step": 235, + "train_speed(iter/s)": 
0.063676 + }, + { + "epoch": 4.8, + "grad_norm": 0.31765564689044157, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -0.451171875, + "logits/rejected": -0.640625, + "logps/chosen": -308.0, + "logps/rejected": -664.0, + "loss": 0.21198019981384278, + "memory(GiB)": 77.63, + "nll_loss": 0.44140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 28.125, + "rewards/margins": 35.0, + "rewards/rejected": -6.71875, + "step": 240, + "train_speed(iter/s)": 0.063884 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.314453125, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -320.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.333984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 17.75, + "eval_rewards/rejected": -9.5, + "eval_runtime": 7.1052, + "eval_samples_per_second": 0.563, + "eval_steps_per_second": 0.141, + "step": 240 + }, + { + "epoch": 4.9, + "grad_norm": 0.44758290651304805, + "learning_rate": 1.0978021666005478e-07, + "logits/chosen": -0.2734375, + "logits/rejected": -0.609375, + "logps/chosen": -416.0, + "logps/rejected": -520.0, + "loss": 0.257675838470459, + "memory(GiB)": 77.63, + "nll_loss": 0.41796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.875, + "rewards/margins": 27.0, + "rewards/rejected": -2.078125, + "step": 245, + "train_speed(iter/s)": 0.063828 + }, + { + "epoch": 5.0, + "grad_norm": 0.3098869617371432, + "learning_rate": 0.0, + "logits/chosen": -0.5, + "logits/rejected": -0.6328125, + "logps/chosen": -274.0, + "logps/rejected": -508.0, + "loss": 0.2909207820892334, + "memory(GiB)": 77.63, + "nll_loss": 0.30078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 32.5, + "rewards/rejected": -9.375, + "step": 250, + "train_speed(iter/s)": 0.064005 + }, + { + "epoch": 5.0, + "eval_logits/chosen": -0.80859375, + "eval_logits/rejected": -0.31640625, + "eval_logps/chosen": -6.84375, + 
"eval_logps/rejected": -322.0, + "eval_loss": 0.4306640625, + "eval_nll_loss": 0.326171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.3125, + "eval_rewards/margins": 18.0, + "eval_rewards/rejected": -9.6875, + "eval_runtime": 7.0302, + "eval_samples_per_second": 0.569, + "eval_steps_per_second": 0.142, + "step": 250 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1376108982829056.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250/zero_to_fp32.py @@ 
-0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # 
there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v 
in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b3ff9d9d49aebd33b0bb8baf9c2951527e786aa1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:444e56ec17eacc670383eaa8842a2f9b1d786d4b8e00bdc2a7c02e6446bbd5ca +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fbd327325a2a4e6aa91827257a86106715ecc2b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fca740f5d24ff7e575ed419596968b3ae5cfa0e012e848c6afebeee3aa8d7ee +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfa6c86593d3f2d9d7a7edcb32cf51b7c8cd9ebc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149d9e8f4dfca54ed0308e757bbd48e166d1dc9aa338a07bb34d2cffa18e7a7d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f0789e246a47b9b35af7652318e1c12ff3a5a7b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598011b82e5d668344f3f481df1b020f31e467ad7404203cb1af1a0404906b4b +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3948ee081c076de6a59a482407a2e6b50d5b593 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9573064a893ebd24ab6912c7bbe6cd9d01bd45dc376bbd70b6adf949f55caf +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f900ad464d67e889f04c1e0a6098f8a0f08dba75 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff159b748e83a37b8daf49d6417a317a16e7a47c252a618f5923219bd1fecbe2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..df6ab9ebc2abfcde662c057ced3ef9b21e611f07 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f9f97138c024f447cb8fb4bb519ba16156eff2bd4fef8e0751f73983020f80 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..228f9d42bbce2fafc85ec96256bbc64860b6db4f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8339d207e0e9be3a3db8b1b58b06a1f6549d131e65af79438d45424a260ee907 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6f3135d25475645db746010d9bed32cd85a537d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:306a20ccea025a34fa07adfe2ee74eb244225ea06c6435fb4613a9c2ba06b622 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bcffb48ed29cf935e87696b06d3800f19de4ffa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c8ffd4611bb09396e08d380ff3ce10b0085aee05d094fb471060a45507e1ea +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9754dc4cea53aa9bc44de5358330e3b1e1475be6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e25bca31d4189fce4d27b2e0ecc9a65725427d6ea102128c1eab6be28d7ba8 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e2efca5c9bfc8e701f74a206aab727a4bfe8907 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547af713cb635c93bc314e50ef3f97bb67a5c0b2c1db4da2ba4daa9eb54be31e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e6ee38440e2979effa4f34ba2c9d18a28633c4d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28eaa9cec67ff4a549999914d0e6318bf95f75e02114d276b83d75ddf83773ab +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6450e6693c793db3c0635360f8f46be545e3f06c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292d7133c261a316449969f7acb1c8765251b9c91a483e0cfdebc5902487e90d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64b019aade36ffb046a822b6feabefbe9d71ba6f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422be9302c3d118d4908e04c7014d15e9697c557e19307a0e0706b4532c9328d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ccdee1cb5a4f76e3c6215cb6aec5de27d26851 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68869f865c86ce9f47b6a66ffc7ea070031db04d2458cbe22bea681d78c1516 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..393ad6b3854075efb4a2a4e7aa2d888ebb7932b4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89dd26b5d1c7d6cc6f29cc02b865025ea51c3bed626dd096bbbc6e1cb3e2321f +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..35252259eb09d8de259231f63f19e786e44bc7b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb7d8df6ed170dd98dba8737bc9dd038af61afd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e144a445ffd57fbb5be9b5131f17149bde6c4ff5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10f35268ac2a0cb68abc0b78ba5b150b0f29d78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6ef21562e384e0889ec2400e8f84b6b0bc59035 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..126662e5f97dd3f9cd7fb87e70843d3d5532dde3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4e6b27cc99b0fa8e6bbf967892f9304b444d81d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..14e214a61e5311f2b7edf2200ec0365ed2dcc5e1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f899c511136dcc56c12c5058062d17686812d1b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab67fe2c6ce9ef0b836ccb0828d5bb1b593055af205b40af49c85139e46a6c8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e79e624379e7f6cdca21cf2cd98b8d6d9abb9b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.42773438, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40", + "epoch": 0.8, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 
0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 221393861476352.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6cefdcff090965a32975cc78ae169df44132d57a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2417679d063a47a8e988d713c948e5d6c313b7a7b7a83eb17fe767320c8ff28c +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8ed63331744de178113514591099d59b69520e7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78cb10448e88cf2707f6a55a15fe4336caf92142688d23d09421e5f82686340e +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e092fe2da5695e45a0935a350d2604d259de0a8f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2b17c58f8210e6c51356248adc58b4d4fcc134b37dc6f1319c747ff8188c102 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcdf7e3a1368783597ee3a38712e6ac6857e5941 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac41d3272595f0d039a0265c55e2842f7bb7e09bcc0d0504384a1b5d48a2d46 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa24f985d8d9798427738b0e138bc14786e28480 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b282b568cd976c9db6b12167aad1c7cc1cd6659ea2a448dc0397001dff700f41 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d1f8536f78b6248237b8dc2d02e8d93937fd203 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323f88f99572606b9ab954f85410ec3982d4fc5399fc53ef32b7d4119f065a3a +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d9d6f7a947d21a445cb82e61d88d49562da748f0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d326ad174a9642e8e3bfc16b713ffe8fafed0e1913baaf81184f119942cabbcd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1e6a1577bb313e63073481d04f70be31b7de1f4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9d349f5fe51f05e6dd2ac0504cc69a5e17ae1e5d7fd3e8d835aa6e751295f7a +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72d103e8e8acb8f179b58af0a5f7db41e58d4f79 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:27fcd8eae03e9cbeae9111db85347e13137fc36b5b2756fb47c70bd4993d1e90 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29deaea9c58661ef97ee870255342e1d172aac83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c284720c6a155eaa9f223a7164257bee0a6858a14e74d7dd38421bebb3be828 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d41cb8972ddf880b3c1a50ddacc4334a8a70f64 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4156d535ba2e53127fc1c9178c60f68f966a337f784b0bcd2fc4a2a4ad9880bf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b12bf3bb62c4250241f7bedf81dbe10b3767a994 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740baf51b654bd86129f67a1e19c8064e692d262a8c0f783c36439e62cbb93bd +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74a3f4bd80fabe6858c395d9eb1b00aa3ad40a42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d567cb37ea33f6ebec8c58a4c34940f780218e0021986c8dbe911d7ad9ec996d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..754870cad9171c351d5adf5ee1e56900ce5b2411 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a30d0eac44269c76d36e28c38af1aa7363606a24d80ebf26bbe124c16fcc3b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e446bf584ba9a71d6e633c8b7847cf5191b6837d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d468b3f5b562b3fe888d3898ffeffa1262e040b9b6e68273ac0d87116a063b4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ee8344c8591d75ecb708e55b2ce7f169dcd718 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41b50466f79426e77fbf5d50d6c30919e31c9974901356eefb01f2898a7e163 +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74adff193bd72452111546fef8b88d12977c0d29 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82879572d7ae33455092506af3e3daa827253fcdd7b0ceab4b9d927debbbff3 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..728c3241a49cbd920d5df86255fc8be4d97c5519 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa3ba485fff4300fd9029c17ba92c92630af852a00df0a0e8d16c233f74cbc8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..183808804dd9c620a909047f45f1a46b5cd0aa5b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.41845703, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60", + "epoch": 1.2, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 
0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + "epoch": 1.0, + "grad_norm": 0.9252536379780557, + 
"learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + 
"should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 328885461319680.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + 
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def 
parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More 
Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e96653d6ce7bdae915b6cbf183a5fbfce189b723 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..380bfecabbab46174f7a9ba717e1a2041f58f15e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59e939e7343bb4d824bf704435d9287c3a6a73f42f167ea442c814c73314cda +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a23335ad0c2eed7b078244c102ee44b149a2bc3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": 
false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, 
+ "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + 
"gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": 
"gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + 
"local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37dd5adb842770175b1ad341fa69f834495622ce --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c432d0ac3efe2d8856ee269217633cc8c5aef93479a606cde3970fd6aaab23 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e92c582511bfc7d7c56e0c6331c11f93e22039e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8bc9aa07f48da20534ab2b895390db816eee6a170b98d679ccf0b4321b69368 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a74c352855d7ddfef73100c79ccb00fe7b14241b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f065e6712852e97a1119e29278a60ba1f799020a359295472979021ec80c0b +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f123ad535357e913491636c28c30ffd56d3aab7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42fb94d319f21981c3f0568c8b9485f1930aa819608cd6e845910494776d95f2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8918aad7cc9dd78d777a3b2ecf3de226264fd5be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e4c5838dca09677e08159ea842501dd059d0b761eb29d64fcd12db4aead4a3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b2b31c488b57af98cbac1387c7c9bba117690008 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704075190a81bd536c861b8931bb16f8481777daed1f781a8be78a362453317c +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d03e73fd7c501f7d5bc83422177605642967359e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f7f64b5f1eb13d0d3ac00443a87044b18b646f29310bc6d3ab912d67bf49936 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4cf0334e3d93482d797c2a84f36aa8a8a83eeda --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e4ab5cca01239b37cc56285710efcb6d68438e32b1bcb60f85ace6ddaac28a87 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73e38c53f23f63a98ec9581d947d5afd6a2e98db --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e7de4e3ccba42c909313ed26f149051f892f5861085cecf4a3cf12076740a2 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f0bf213154388ad221c77d7a4bf8f700f5e4534 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89e657a3fa1296cf8b2bc16e5d67732135b80ce4671a2770264ce51ed3513b71 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..765a4c2fefda0cac967cbd088abeba9c8413d9c1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc4ae517cd158c8d9476b726d4c4309889718bca4c975a54764f6f58671baa6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7bbef9f6866dfdeb0c1b8f50404d66327426e86 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea66f33da2f5c3ca1aeb17297dc91104cf4c07d8f73e0c159761208eae994148 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50d3758a8d27aff7b69ce3ec2720332a6ed925b6 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71fd8aaf20583187252ea5247d72cac1fa3c08a3f5034bf612f5a1f2ba5aa5f +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f354e5143107b1786a195973935715e516ed6288 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ddc9385ff0b7b3654dc493f0db74012aab5c3cf20551f2f15e49926566e9ef6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a278b47a4846eba296205bed86e9d5885783a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb720a6b01bbb827922d7cc16f85e38ebd798c724910a1e0aa03f27f054b45db +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11fbbd7f76eaaccb51396011384493cd3e3ab681 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02027957976e2c5706e7776260b7e1dfd70c633e9d6c5c538b1b0ef230442315 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd2a62da4ca83b3b986d96dbf0eaeb82207ca93 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0628a9017696045a3a29e9eaffc71e9262d855716e773c0c3be760a1fe85bc8 +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ba5f3aba4388a582cd47f7f9e57cd5879b1cbd2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df342004a4d8e3626bf2a9f689fde7c8bfd6d995e14931f5496eda1f456cb6f2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27b0f7845c2b9530c3e6ed3ce232ff4e86b86122 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02096eb4e8850b91490e80e4a042e2e60f71bd2abc6a269d62c271649cb77d2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfb583fc43c6dd4395671708744cfd18c419970 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c778d3d0e7e3d5665fa0a9ecd92986609c430da08b41611d6c05dc19815a8 +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a8c64b1f15ac655b2be2a42fe61cabe2a877704 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d978dcb0c34e022ee6750e9d86814b8c82e4965d7e07662f35f06eeac12938f3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..262e8187e6caeca12ef3b0aa923b12afd697e03d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e83399aed1d9d173c3e07b2efa8530c956b62b2b68394c2ed0d43bd8bba9d1 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..72f794e31f8d3e0c63972e5076e1ed90c52087ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606ab3ca92e3d20c327c69fdcce7f7e39bec2f2c3538b036088b255f917e3ba4 +size 15984 diff 
--git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..244e7fdaa1cef2e82bd4e16afb10f32f68318bcc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276a987dd22c9093fec58921ba19f340a28f18bff635cc01324e09a3c37ac3a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e36a588df493151f57c8f73aa08129a3810c2c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee30cdff92a069fa950619177f737b278c096bc7c83c0e5bdea15a673218022 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d2089c1ac629147b582ffcfc28ad0a5abeb64a86 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.41748047, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80", + "epoch": 1.6, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.129363871319395, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.16796875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.050045 + }, + { + "epoch": 0.1, + "grad_norm": 13.47369483855226, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5078125, + "logits/rejected": 0.09326171875, + "logps/chosen": -568.0, + "logps/rejected": -232.0, + "loss": 2.149658203125, + "memory(GiB)": 30.65, + "nll_loss": 0.75390625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.125, + "rewards/margins": -0.125, + "rewards/rejected": 0.0, + "step": 5, + "train_speed(iter/s)": 0.070826 + }, + { + "epoch": 0.2, + "grad_norm": 9.281799302422854, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.38671875, + "logps/chosen": -478.0, + "logps/rejected": -458.0, + "loss": 1.72822265625, + "memory(GiB)": 35.11, + "nll_loss": 1.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.609375, + "rewards/margins": 0.48828125, + "rewards/rejected": 0.12060546875, + "step": 10, + "train_speed(iter/s)": 0.065681 + }, + { + "epoch": 0.3, + "grad_norm": 4.394517104497035, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.212890625, + "logits/rejected": 0.04931640625, + "logps/chosen": -444.0, + "logps/rejected": -928.0, + "loss": 1.05458984375, + "memory(GiB)": 57.44, + "nll_loss": 0.6328125, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.5625, + "rewards/margins": 2.859375, + "rewards/rejected": 1.703125, + "step": 15, + "train_speed(iter/s)": 0.061064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5300094805075983, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.6015625, + "logits/rejected": -0.431640625, + "logps/chosen": -300.0, + "logps/rejected": -318.0, + "loss": 0.703759765625, + "memory(GiB)": 57.44, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4375, + "rewards/margins": 4.90625, + "rewards/rejected": 1.546875, + "step": 20, + "train_speed(iter/s)": 0.062406 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.953125, + "eval_logits/rejected": -0.3828125, + "eval_logps/chosen": -9.875, + "eval_logps/rejected": -186.0, + "eval_loss": 0.53662109375, + "eval_nll_loss": 0.470703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.0, + "eval_rewards/margins": 4.09375, + "eval_rewards/rejected": 3.90625, + "eval_runtime": 7.0778, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833158966477413, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.466796875, + "logits/rejected": -0.34765625, + "logps/chosen": -255.0, + "logps/rejected": -394.0, + "loss": 0.5664306640625, + "memory(GiB)": 61.35, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.125, + "rewards/margins": 10.0625, + "rewards/rejected": 0.0810546875, + "step": 25, + "train_speed(iter/s)": 0.061629 + }, + { + "epoch": 0.6, + "grad_norm": 0.5069545206823683, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.81640625, + "logits/rejected": -0.63671875, + "logps/chosen": -312.0, + "logps/rejected": -584.0, + "loss": 0.518994140625, + "memory(GiB)": 61.35, + "nll_loss": 0.65234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.3125, + "rewards/margins": 9.6875, + "rewards/rejected": 
0.6015625, + "step": 30, + "train_speed(iter/s)": 0.063686 + }, + { + "epoch": 0.7, + "grad_norm": 1.0656827195195997, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.21875, + "logits/rejected": -0.60546875, + "logps/chosen": -420.0, + "logps/rejected": -214.0, + "loss": 0.51473388671875, + "memory(GiB)": 61.35, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0625, + "rewards/margins": 12.375, + "rewards/rejected": 2.671875, + "step": 35, + "train_speed(iter/s)": 0.065515 + }, + { + "epoch": 0.8, + "grad_norm": 0.6780020698373402, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6171875, + "logits/rejected": -0.640625, + "logps/chosen": -174.0, + "logps/rejected": -368.0, + "loss": 0.45928955078125, + "memory(GiB)": 61.35, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.875, + "rewards/margins": 12.375, + "rewards/rejected": -0.53515625, + "step": 40, + "train_speed(iter/s)": 0.064272 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.38671875, + "eval_logps/chosen": -5.4375, + "eval_logps/rejected": -244.0, + "eval_loss": 0.427734375, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 10.3125, + "eval_rewards/rejected": -1.8984375, + "eval_runtime": 7.0584, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.7227421522145911, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.46484375, + "logits/rejected": -0.88671875, + "logps/chosen": -376.0, + "logps/rejected": -446.0, + "loss": 0.4487548828125, + "memory(GiB)": 70.32, + "nll_loss": 0.4296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 18.0, + "rewards/rejected": -0.259765625, + "step": 45, + "train_speed(iter/s)": 0.063461 + }, + { + "epoch": 1.0, + "grad_norm": 0.9252536379780557, + 
"learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.1796875, + "logits/rejected": -0.07275390625, + "logps/chosen": -384.0, + "logps/rejected": -1016.0, + "loss": 0.442578125, + "memory(GiB)": 72.28, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.5, + "rewards/margins": 13.0, + "rewards/rejected": -0.51953125, + "step": 50, + "train_speed(iter/s)": 0.064239 + }, + { + "epoch": 1.1, + "grad_norm": 0.5443623347983845, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 0.01220703125, + "logits/rejected": -0.490234375, + "logps/chosen": -310.0, + "logps/rejected": -636.0, + "loss": 0.4554443359375, + "memory(GiB)": 77.63, + "nll_loss": 0.4609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 21.0, + "rewards/rejected": -1.8984375, + "step": 55, + "train_speed(iter/s)": 0.064341 + }, + { + "epoch": 1.2, + "grad_norm": 0.9491701785504059, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.34765625, + "logits/rejected": -0.37109375, + "logps/chosen": -173.0, + "logps/rejected": -532.0, + "loss": 0.41307373046875, + "memory(GiB)": 77.63, + "nll_loss": 0.310546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 14.25, + "rewards/rejected": -1.1953125, + "step": 60, + "train_speed(iter/s)": 0.064763 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8203125, + "eval_logits/rejected": -0.32421875, + "eval_logps/chosen": -5.46875, + "eval_logps/rejected": -270.0, + "eval_loss": 0.41845703125, + "eval_nll_loss": 0.259765625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 12.9375, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.374, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.136, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6746923195914712, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.32421875, + "logits/rejected": -0.1865234375, + "logps/chosen": 
-249.0, + "logps/rejected": -820.0, + "loss": 0.461590576171875, + "memory(GiB)": 77.63, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0, + "rewards/margins": 25.625, + "rewards/rejected": -12.625, + "step": 65, + "train_speed(iter/s)": 0.064043 + }, + { + "epoch": 1.4, + "grad_norm": 0.4557775784440204, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.73828125, + "logits/rejected": -0.51953125, + "logps/chosen": -47.0, + "logps/rejected": -1112.0, + "loss": 0.37837066650390627, + "memory(GiB)": 77.63, + "nll_loss": 0.4921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.0, + "rewards/margins": 34.75, + "rewards/rejected": -24.75, + "step": 70, + "train_speed(iter/s)": 0.064771 + }, + { + "epoch": 1.5, + "grad_norm": 0.4549474900285614, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.33984375, + "logits/rejected": -0.6640625, + "logps/chosen": -350.0, + "logps/rejected": -478.0, + "loss": 0.4122833251953125, + "memory(GiB)": 77.63, + "nll_loss": 0.361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 27.0, + "rewards/rejected": -10.3125, + "step": 75, + "train_speed(iter/s)": 0.064901 + }, + { + "epoch": 1.6, + "grad_norm": 0.41018476502679196, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.2255859375, + "logits/rejected": -0.439453125, + "logps/chosen": -428.0, + "logps/rejected": -153.0, + "loss": 0.42242431640625, + "memory(GiB)": 77.63, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.0, + "rewards/margins": 21.875, + "rewards/rejected": 0.1708984375, + "step": 80, + "train_speed(iter/s)": 0.065678 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.82421875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -5.625, + "eval_logps/rejected": -388.0, + "eval_loss": 0.41748046875, + "eval_nll_loss": 0.267578125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 8.4375, + "eval_rewards/margins": 
24.75, + "eval_rewards/rejected": -16.25, + "eval_runtime": 7.0481, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 436433847844864.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f3a1cb7ee045c8134df72deb130b4ee7bfee809 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb28084ed24a473e669dbb727d43063dd31ea850bf64e7453af1c30f44b6ba +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..fd8dabdd453fbd1a579feef614d2e41f08e62f41 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..53516b672cd0c49349707d8de51a2b500a37e7bf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e6edca57ff7043670b1c1a02bd8be9aeff5e631f Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..4939a09c95918bd47d6855d2cd06cbb5055fac90 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..aadb94780562fda27c1ba31c9c6758bd3144504d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b1561df5d3091a5bb6980346d278f2bb2cbad898 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..bcd3f332a0a9276590b2d2aad6821619831dd64b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2d92b2935eeea4e67c6094c7572d70d651e99b88 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..ad7bd9f7a397914122882959782e01fc52eb1408 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..0404ecd92edc492b337a7c65650729cb65bf0026 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..a5ecd3d3098f1f05692497070a9f02956264f390 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3a8580ea993408309304fdadb3db1cbad5ee96f4 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..ca2eba3d7a2db884a74e4b9bff1f6045bc500832 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..97933611536b8acda691b8294659ee4282ded3e6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..5806ea1aed32b620b16d36f4583e8a496651bda5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..f869fa37203441700940366c27c7d20493a62bae Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..aadc4f995663fc5da08456cd9b281f2c781c5fe5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f90bb9883cffecb70cb5166a80f72e2ee7428025 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e0c32eca0dcb97382d221a16e13b7e54a35b3a4f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f07746df675d76574bf315e3e304bbbd566c2748 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..7d5bbac759e00821c1498f445949e7836d0a6f92 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_memory(GiB).png new file mode 100644 
index 0000000000000000000000000000000000000000..4cf6d0799f8b86d26ba062bb0089338585d215fc Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..55cf46673a5117efb12fe67e450be99d15b2d9be Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..24a51e8bc68d51e2581ddd91f84ffecea088f789 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..6d381d1143d59016a2d83f438f2b8883438a5b71 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..9dffe74645bed8729a0a5b26daa9e1ed02ce1d34 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f5c09f1b77b35419fb3f9580bcea4d7c7426f3f9 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..4779e8b2917abfec6216b5df82125cf617f54a8c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..e7e995ec481eb56103e4d9a73fc98d5e8a44fdd8 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_loss.png 
differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..0520343f3af5197abc8389a4e142272d8f23abd2 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..d4189a4298bc308a9c10f797bd0ee6ba18263a62 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..09d2660a3c816f512f2631fd3381c847d8b8324f Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..59fb7ab74986ea62036641b005a090372c102545 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/logging.jsonl b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0526994bf56e8be5e4705723c101698150fb80b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/logging.jsonl @@ -0,0 +1,66 @@ +{"loss": 2.16796875, "grad_norm": 13.12936387, "learning_rate": 7.69e-06, "memory(GiB)": 28.09, "train_speed(iter/s)": 0.050045, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -156.0, "logps/chosen": -490.0, "logits/rejected": -0.50390625, "logits/chosen": -0.49414062, "nll_loss": 0.50390625, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "15s", "remaining_time": "1h 3m 23s"} +{"loss": 2.1496582, "grad_norm": 13.47369484, "learning_rate": 3.846e-05, "memory(GiB)": 30.65, "train_speed(iter/s)": 0.070826, "rewards/chosen": -0.125, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": -0.125, "logps/rejected": -232.0, "logps/chosen": -568.0, "logits/rejected": 0.09326172, "logits/chosen": 1.5078125, "nll_loss": 0.75390625, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "1m 5s", "remaining_time": "53m 48s"} +{"loss": 1.72822266, "grad_norm": 9.2817993, "learning_rate": 7.692e-05, "memory(GiB)": 35.11, "train_speed(iter/s)": 0.065681, "rewards/chosen": 0.609375, "rewards/rejected": 0.12060547, "rewards/accuracies": 0.60000002, "rewards/margins": 0.48828125, "logps/rejected": 
-458.0, "logps/chosen": -478.0, "logits/rejected": -0.38671875, "logits/chosen": -0.6015625, "nll_loss": 1.125, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "2m 27s", "remaining_time": "59m 1s"} +{"loss": 1.05458984, "grad_norm": 4.3945171, "learning_rate": 9.998e-05, "memory(GiB)": 57.44, "train_speed(iter/s)": 0.061064, "rewards/chosen": 4.5625, "rewards/rejected": 1.703125, "rewards/accuracies": 0.80000001, "rewards/margins": 2.859375, "logps/rejected": -928.0, "logps/chosen": -444.0, "logits/rejected": 0.04931641, "logits/chosen": -0.21289062, "nll_loss": 0.6328125, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "4m 0s", "remaining_time": "1h 2m 54s"} +{"loss": 0.70375977, "grad_norm": 0.53000948, "learning_rate": 9.978e-05, "memory(GiB)": 57.44, "train_speed(iter/s)": 0.062406, "rewards/chosen": 6.4375, "rewards/rejected": 1.546875, "rewards/accuracies": 1.0, "rewards/margins": 4.90625, "logps/rejected": -318.0, "logps/chosen": -300.0, "logits/rejected": -0.43164062, "logits/chosen": -0.6015625, "nll_loss": 0.4765625, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "5m 15s", "remaining_time": "1h 0m 31s"} +{"eval_loss": 0.53662109, "eval_runtime": 7.0778, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 8.0, "eval_rewards/rejected": 3.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.09375, "eval_logps/rejected": -186.0, "eval_logps/chosen": -9.875, "eval_logits/rejected": -0.3828125, "eval_logits/chosen": -0.953125, "eval_nll_loss": 0.47070312, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "5m 22s", "remaining_time": "1h 1m 52s"} +{"loss": 0.56643066, "grad_norm": 0.5833159, "learning_rate": 9.937e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.061629, "rewards/chosen": 10.125, "rewards/rejected": 0.08105469, "rewards/accuracies": 1.0, 
"rewards/margins": 10.0625, "logps/rejected": -394.0, "logps/chosen": -255.0, "logits/rejected": -0.34765625, "logits/chosen": -0.46679688, "nll_loss": 0.41210938, "epoch": 0.5, "global_step/max_steps": "25/250", "percentage": "10.00%", "elapsed_time": "6m 40s", "remaining_time": "1h 0m 8s"} +{"loss": 0.51899414, "grad_norm": 0.50695452, "learning_rate": 9.874e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.063686, "rewards/chosen": 10.3125, "rewards/rejected": 0.6015625, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -584.0, "logps/chosen": -312.0, "logits/rejected": -0.63671875, "logits/chosen": -0.81640625, "nll_loss": 0.65234375, "epoch": 0.6, "global_step/max_steps": "30/250", "percentage": "12.00%", "elapsed_time": "7m 46s", "remaining_time": "56m 59s"} +{"loss": 0.51473389, "grad_norm": 1.06568272, "learning_rate": 9.789e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.065515, "rewards/chosen": 15.0625, "rewards/rejected": 2.671875, "rewards/accuracies": 1.0, "rewards/margins": 12.375, "logps/rejected": -214.0, "logps/chosen": -420.0, "logits/rejected": -0.60546875, "logits/chosen": -0.21875, "nll_loss": 0.68359375, "epoch": 0.7, "global_step/max_steps": "35/250", "percentage": "14.00%", "elapsed_time": "8m 49s", "remaining_time": "54m 12s"} +{"loss": 0.45928955, "grad_norm": 0.67800207, "learning_rate": 9.683e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.064272, "rewards/chosen": 11.875, "rewards/rejected": -0.53515625, "rewards/accuracies": 1.0, "rewards/margins": 12.375, "logps/rejected": -368.0, "logps/chosen": -174.0, "logits/rejected": -0.640625, "logits/chosen": -0.6171875, "nll_loss": 0.52734375, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "10m 17s", "remaining_time": "54m 2s"} +{"eval_loss": 0.42773438, "eval_runtime": 7.0584, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -1.8984375, 
"eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.3125, "eval_logps/rejected": -244.0, "eval_logps/chosen": -5.4375, "eval_logits/rejected": -0.38671875, "eval_logits/chosen": -0.859375, "eval_nll_loss": 0.25976562, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "10m 24s", "remaining_time": "54m 39s"} +{"loss": 0.44875488, "grad_norm": 0.72274215, "learning_rate": 9.557e-05, "memory(GiB)": 70.32, "train_speed(iter/s)": 0.063461, "rewards/chosen": 17.75, "rewards/rejected": -0.25976562, "rewards/accuracies": 1.0, "rewards/margins": 18.0, "logps/rejected": -446.0, "logps/chosen": -376.0, "logits/rejected": -0.88671875, "logits/chosen": -0.46484375, "nll_loss": 0.4296875, "epoch": 0.9, "global_step/max_steps": "45/250", "percentage": "18.00%", "elapsed_time": "11m 44s", "remaining_time": "53m 28s"} +{"loss": 0.44257812, "grad_norm": 0.92525364, "learning_rate": 9.411e-05, "memory(GiB)": 72.28, "train_speed(iter/s)": 0.064239, "rewards/chosen": 12.5, "rewards/rejected": -0.51953125, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -1016.0, "logps/chosen": -384.0, "logits/rejected": -0.07275391, "logits/chosen": -0.1796875, "nll_loss": 0.40234375, "epoch": 1.0, "global_step/max_steps": "50/250", "percentage": "20.00%", "elapsed_time": "12m 53s", "remaining_time": "51m 34s"} +{"loss": 0.45544434, "grad_norm": 0.54436233, "learning_rate": 9.245e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064341, "rewards/chosen": 19.0, "rewards/rejected": -1.8984375, "rewards/accuracies": 1.0, "rewards/margins": 21.0, "logps/rejected": -636.0, "logps/chosen": -310.0, "logits/rejected": -0.49023438, "logits/chosen": 0.01220703, "nll_loss": 0.4609375, "epoch": 1.1, "global_step/max_steps": "55/250", "percentage": "22.00%", "elapsed_time": "14m 10s", "remaining_time": "50m 14s"} +{"loss": 0.41307373, "grad_norm": 0.94917018, "learning_rate": 9.061e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064763, 
"rewards/chosen": 13.0625, "rewards/rejected": -1.1953125, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -532.0, "logps/chosen": -173.0, "logits/rejected": -0.37109375, "logits/chosen": -0.34765625, "nll_loss": 0.31054688, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "15m 21s", "remaining_time": "48m 38s"} +{"eval_loss": 0.41845703, "eval_runtime": 7.374, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.136, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.9375, "eval_logps/rejected": -270.0, "eval_logps/chosen": -5.46875, "eval_logits/rejected": -0.32421875, "eval_logits/chosen": -0.8203125, "eval_nll_loss": 0.25976562, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "15m 29s", "remaining_time": "49m 2s"} +{"loss": 0.46159058, "grad_norm": 0.67469232, "learning_rate": 8.858e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064043, "rewards/chosen": 13.0, "rewards/rejected": -12.625, "rewards/accuracies": 1.0, "rewards/margins": 25.625, "logps/rejected": -820.0, "logps/chosen": -249.0, "logits/rejected": -0.18652344, "logits/chosen": -0.32421875, "nll_loss": 0.46289062, "epoch": 1.3, "global_step/max_steps": "65/250", "percentage": "26.00%", "elapsed_time": "16m 50s", "remaining_time": "47m 55s"} +{"loss": 0.37837067, "grad_norm": 0.45577758, "learning_rate": 8.639e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064771, "rewards/chosen": 10.0, "rewards/rejected": -24.75, "rewards/accuracies": 1.0, "rewards/margins": 34.75, "logps/rejected": -1112.0, "logps/chosen": -47.0, "logits/rejected": -0.51953125, "logits/chosen": -0.73828125, "nll_loss": 0.4921875, "epoch": 1.4, "global_step/max_steps": "70/250", "percentage": "28.00%", "elapsed_time": "17m 56s", "remaining_time": "46m 6s"} +{"loss": 0.41228333, "grad_norm": 0.45494749, "learning_rate": 8.404e-05, "memory(GiB)": 
77.63, "train_speed(iter/s)": 0.064901, "rewards/chosen": 16.75, "rewards/rejected": -10.3125, "rewards/accuracies": 1.0, "rewards/margins": 27.0, "logps/rejected": -478.0, "logps/chosen": -350.0, "logits/rejected": -0.6640625, "logits/chosen": -0.33984375, "nll_loss": 0.36132812, "epoch": 1.5, "global_step/max_steps": "75/250", "percentage": "30.00%", "elapsed_time": "19m 10s", "remaining_time": "44m 45s"} +{"loss": 0.42242432, "grad_norm": 0.41018477, "learning_rate": 8.154e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065678, "rewards/chosen": 22.0, "rewards/rejected": 0.17089844, "rewards/accuracies": 1.0, "rewards/margins": 21.875, "logps/rejected": -153.0, "logps/chosen": -428.0, "logits/rejected": -0.43945312, "logits/chosen": 0.22558594, "nll_loss": 0.5078125, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "20m 13s", "remaining_time": "42m 58s"} +{"eval_loss": 0.41748047, "eval_runtime": 7.0481, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -16.25, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 24.75, "eval_logps/rejected": -388.0, "eval_logps/chosen": -5.625, "eval_logits/rejected": -0.32617188, "eval_logits/chosen": -0.82421875, "eval_nll_loss": 0.26757812, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "20m 20s", "remaining_time": "43m 13s"} +{"loss": 0.54422607, "grad_norm": 0.65764963, "learning_rate": 7.89e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065121, "rewards/chosen": 13.5625, "rewards/rejected": -14.875, "rewards/accuracies": 1.0, "rewards/margins": 28.5, "logps/rejected": -796.0, "logps/chosen": -217.0, "logits/rejected": -0.5078125, "logits/chosen": -0.33984375, "nll_loss": 0.30859375, "epoch": 1.7, "global_step/max_steps": "85/250", "percentage": "34.00%", "elapsed_time": "21m 40s", "remaining_time": "42m 4s"} +{"loss": 0.39871826, "grad_norm": 0.65452715, 
"learning_rate": 7.614e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064715, "rewards/chosen": 14.1875, "rewards/rejected": -8.875, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -532.0, "logps/chosen": -235.0, "logits/rejected": -0.73828125, "logits/chosen": -0.375, "nll_loss": 0.45703125, "epoch": 1.8, "global_step/max_steps": "90/250", "percentage": "36.00%", "elapsed_time": "23m 6s", "remaining_time": "41m 4s"} +{"loss": 0.41851807, "grad_norm": 0.39357934, "learning_rate": 7.326e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06466, "rewards/chosen": 17.25, "rewards/rejected": -9.25, "rewards/accuracies": 1.0, "rewards/margins": 26.5, "logps/rejected": -564.0, "logps/chosen": -282.0, "logits/rejected": -0.328125, "logits/chosen": -0.328125, "nll_loss": 0.37109375, "epoch": 1.9, "global_step/max_steps": "95/250", "percentage": "38.00%", "elapsed_time": "24m 24s", "remaining_time": "39m 49s"} +{"loss": 0.38671875, "grad_norm": 0.36803625, "learning_rate": 7.028e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064448, "rewards/chosen": 13.6875, "rewards/rejected": -5.375, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -384.0, "logps/chosen": -212.0, "logits/rejected": -0.42578125, "logits/chosen": -0.390625, "nll_loss": 0.3359375, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "25m 46s", "remaining_time": "38m 40s"} +{"eval_loss": 0.40966797, "eval_runtime": 6.9622, "eval_samples_per_second": 0.575, "eval_steps_per_second": 0.144, "eval_rewards/chosen": 8.5, "eval_rewards/rejected": -7.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.375, "eval_logps/rejected": -304.0, "eval_logps/chosen": -5.125, "eval_logits/rejected": -0.45703125, "eval_logits/chosen": -0.87109375, "eval_nll_loss": 0.24414062, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "25m 53s", "remaining_time": "38m 50s"} +{"loss": 
0.37179871, "grad_norm": 0.501569, "learning_rate": 6.72e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063793, "rewards/chosen": 20.875, "rewards/rejected": -5.71875, "rewards/accuracies": 1.0, "rewards/margins": 26.625, "logps/rejected": -640.0, "logps/chosen": -332.0, "logits/rejected": -0.7109375, "logits/chosen": -0.39648438, "nll_loss": 0.43554688, "epoch": 2.1, "global_step/max_steps": "105/250", "percentage": "42.00%", "elapsed_time": "27m 21s", "remaining_time": "37m 46s"} +{"loss": 0.35559387, "grad_norm": 0.61993677, "learning_rate": 6.406e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063742, "rewards/chosen": 15.9375, "rewards/rejected": -16.5, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -1040.0, "logps/chosen": -306.0, "logits/rejected": -0.25, "logits/chosen": -0.29101562, "nll_loss": 0.33203125, "epoch": 2.2, "global_step/max_steps": "110/250", "percentage": "44.00%", "elapsed_time": "28m 41s", "remaining_time": "36m 30s"} +{"loss": 0.38040924, "grad_norm": 0.33697807, "learning_rate": 6.085e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063906, "rewards/chosen": 18.625, "rewards/rejected": -7.21875, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -696.0, "logps/chosen": -354.0, "logits/rejected": -0.44921875, "logits/chosen": -0.2109375, "nll_loss": 0.36328125, "epoch": 2.3, "global_step/max_steps": "115/250", "percentage": "46.00%", "elapsed_time": "29m 54s", "remaining_time": "35m 6s"} +{"loss": 0.31162043, "grad_norm": 0.59726256, "learning_rate": 5.759e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064398, "rewards/chosen": 17.25, "rewards/rejected": -1.84375, "rewards/accuracies": 1.0, "rewards/margins": 19.0, "logps/rejected": -326.0, "logps/chosen": -278.0, "logits/rejected": -0.546875, "logits/chosen": -0.43359375, "nll_loss": 0.26757812, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "30m 58s", "remaining_time": "33m 33s"} 
+{"eval_loss": 0.41699219, "eval_runtime": 7.0281, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -8.125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.5, "eval_logps/rejected": -306.0, "eval_logps/chosen": -5.84375, "eval_logits/rejected": -0.3515625, "eval_logits/chosen": -0.83984375, "eval_nll_loss": 0.27929688, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "31m 5s", "remaining_time": "33m 41s"} +{"loss": 0.32363739, "grad_norm": 0.76199241, "learning_rate": 5.43e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064435, "rewards/chosen": 12.25, "rewards/rejected": -16.0, "rewards/accuracies": 1.0, "rewards/margins": 28.25, "logps/rejected": -1072.0, "logps/chosen": -17.5, "logits/rejected": -0.5078125, "logits/chosen": -0.859375, "nll_loss": 0.21777344, "epoch": 2.5, "global_step/max_steps": "125/250", "percentage": "50.00%", "elapsed_time": "32m 15s", "remaining_time": "32m 15s"} +{"loss": 0.31024857, "grad_norm": 1.41941432, "learning_rate": 5.099e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064846, "rewards/chosen": 19.125, "rewards/rejected": -2.640625, "rewards/accuracies": 1.0, "rewards/margins": 21.75, "logps/rejected": -278.0, "logps/chosen": -304.0, "logits/rejected": -0.66015625, "logits/chosen": -0.52734375, "nll_loss": 0.30078125, "epoch": 2.6, "global_step/max_steps": "130/250", "percentage": "52.00%", "elapsed_time": "33m 20s", "remaining_time": "30m 46s"} +{"loss": 0.29241142, "grad_norm": 1.04337297, "learning_rate": 4.768e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064914, "rewards/chosen": 12.3125, "rewards/rejected": -13.625, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -664.0, "logps/chosen": -116.0, "logits/rejected": -0.34375, "logits/chosen": -0.484375, "nll_loss": 0.12695312, "epoch": 2.7, "global_step/max_steps": "135/250", "percentage": "54.00%", "elapsed_time": "34m 
34s", "remaining_time": "29m 27s"} +{"loss": 0.33972807, "grad_norm": 0.42438825, "learning_rate": 4.438e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065209, "rewards/chosen": 18.125, "rewards/rejected": -7.8125, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -684.0, "logps/chosen": -276.0, "logits/rejected": -0.62890625, "logits/chosen": -0.4765625, "nll_loss": 0.49804688, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "35m 42s", "remaining_time": "28m 3s"} +{"eval_loss": 0.41503906, "eval_runtime": 7.0195, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -8.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.75, "eval_logps/rejected": -308.0, "eval_logps/chosen": -5.71875, "eval_logits/rejected": -0.25, "eval_logits/chosen": -0.8125, "eval_nll_loss": 0.27148438, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "35m 49s", "remaining_time": "28m 8s"} +{"loss": 0.34393082, "grad_norm": 1.62611103, "learning_rate": 4.11e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064432, "rewards/chosen": 20.25, "rewards/rejected": -6.90625, "rewards/accuracies": 1.0, "rewards/margins": 27.125, "logps/rejected": -652.0, "logps/chosen": -434.0, "logits/rejected": -0.41796875, "logits/chosen": -0.15917969, "nll_loss": 0.45898438, "epoch": 2.9, "global_step/max_steps": "145/250", "percentage": "58.00%", "elapsed_time": "37m 25s", "remaining_time": "27m 6s"} +{"loss": 0.26485224, "grad_norm": 0.39487302, "learning_rate": 3.786e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064146, "rewards/chosen": 22.0, "rewards/rejected": -1.6640625, "rewards/accuracies": 1.0, "rewards/margins": 23.625, "logps/rejected": -284.0, "logps/chosen": -328.0, "logits/rejected": -0.45117188, "logits/chosen": -0.01367188, "nll_loss": 0.30078125, "epoch": 3.0, "global_step/max_steps": "150/250", 
"percentage": "60.00%", "elapsed_time": "38m 53s", "remaining_time": "25m 55s"} +{"loss": 0.31949735, "grad_norm": 0.39090196, "learning_rate": 3.468e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064309, "rewards/chosen": 21.125, "rewards/rejected": -11.4375, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -648.0, "logps/chosen": -434.0, "logits/rejected": -0.5234375, "logits/chosen": -0.3671875, "nll_loss": 0.27148438, "epoch": 3.1, "global_step/max_steps": "155/250", "percentage": "62.00%", "elapsed_time": "40m 5s", "remaining_time": "24m 34s"} +{"loss": 0.27110376, "grad_norm": 0.31310931, "learning_rate": 3.156e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064187, "rewards/chosen": 23.75, "rewards/rejected": -0.265625, "rewards/accuracies": 1.0, "rewards/margins": 24.0, "logps/rejected": -382.0, "logps/chosen": -320.0, "logits/rejected": -0.765625, "logits/chosen": -0.47265625, "nll_loss": 0.30078125, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "41m 28s", "remaining_time": "23m 19s"} +{"eval_loss": 0.41845703, "eval_runtime": 7.0385, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -8.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.0, "eval_logps/rejected": -312.0, "eval_logps/chosen": -5.90625, "eval_logits/rejected": -0.3125, "eval_logits/chosen": -0.81640625, "eval_nll_loss": 0.28125, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "41m 35s", "remaining_time": "23m 23s"} +{"loss": 0.26008997, "grad_norm": 0.73653156, "learning_rate": 2.852e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064049, "rewards/chosen": 13.75, "rewards/rejected": -11.5625, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -520.0, "logps/chosen": -120.0, "logits/rejected": -0.35546875, "logits/chosen": -0.57421875, "nll_loss": 0.10498047, "epoch": 
3.3, "global_step/max_steps": "165/250", "percentage": "66.00%", "elapsed_time": "42m 51s", "remaining_time": "22m 4s"} +{"loss": 0.22614412, "grad_norm": 0.43675487, "learning_rate": 2.558e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063846, "rewards/chosen": 26.25, "rewards/rejected": 1.578125, "rewards/accuracies": 1.0, "rewards/margins": 24.75, "logps/rejected": -171.0, "logps/chosen": -466.0, "logits/rejected": -0.6328125, "logits/chosen": 0.1796875, "nll_loss": 0.3125, "epoch": 3.4, "global_step/max_steps": "170/250", "percentage": "68.00%", "elapsed_time": "44m 17s", "remaining_time": "20m 50s"} +{"loss": 0.24147272, "grad_norm": 0.57450961, "learning_rate": 2.274e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063902, "rewards/chosen": 24.625, "rewards/rejected": -4.125, "rewards/accuracies": 1.0, "rewards/margins": 28.75, "logps/rejected": -540.0, "logps/chosen": -255.0, "logits/rejected": 0.01257324, "logits/chosen": 0.56640625, "nll_loss": 0.33789062, "epoch": 3.5, "global_step/max_steps": "175/250", "percentage": "70.00%", "elapsed_time": "45m 33s", "remaining_time": "19m 31s"} +{"loss": 0.30578501, "grad_norm": 0.35069974, "learning_rate": 2.002e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064229, "rewards/chosen": 25.5, "rewards/rejected": -2.515625, "rewards/accuracies": 1.0, "rewards/margins": 28.0, "logps/rejected": -284.0, "logps/chosen": -394.0, "logits/rejected": -0.6953125, "logits/chosen": -0.30664062, "nll_loss": 0.43554688, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "46m 37s", "remaining_time": "18m 8s"} +{"eval_loss": 0.43066406, "eval_runtime": 7.0569, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.625, "eval_logps/rejected": -318.0, "eval_logps/chosen": -6.96875, "eval_logits/rejected": -0.30273438, "eval_logits/chosen": -0.80859375, 
"eval_nll_loss": 0.33203125, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "46m 44s", "remaining_time": "18m 10s"} +{"loss": 0.26210859, "grad_norm": 0.50263475, "learning_rate": 1.744e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06422, "rewards/chosen": 24.375, "rewards/rejected": -0.4765625, "rewards/accuracies": 1.0, "rewards/margins": 24.875, "logps/rejected": -221.0, "logps/chosen": -360.0, "logits/rejected": -0.47851562, "logits/chosen": -0.13671875, "nll_loss": 0.41796875, "epoch": 3.7, "global_step/max_steps": "185/250", "percentage": "74.00%", "elapsed_time": "47m 56s", "remaining_time": "16m 50s"} +{"loss": 0.25142608, "grad_norm": 0.45140535, "learning_rate": 1.5e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064487, "rewards/chosen": 24.25, "rewards/rejected": -6.90625, "rewards/accuracies": 1.0, "rewards/margins": 31.125, "logps/rejected": -604.0, "logps/chosen": -185.0, "logits/rejected": -0.48632812, "logits/chosen": -0.58203125, "nll_loss": 0.24707031, "epoch": 3.8, "global_step/max_steps": "190/250", "percentage": "76.00%", "elapsed_time": "49m 1s", "remaining_time": "15m 28s"} +{"loss": 0.22136579, "grad_norm": 0.56000726, "learning_rate": 1.271e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064327, "rewards/chosen": 20.75, "rewards/rejected": -10.375, "rewards/accuracies": 1.0, "rewards/margins": 31.125, "logps/rejected": -656.0, "logps/chosen": -332.0, "logits/rejected": -0.21777344, "logits/chosen": 0.47265625, "nll_loss": 0.27539062, "epoch": 3.9, "global_step/max_steps": "195/250", "percentage": "78.00%", "elapsed_time": "50m 26s", "remaining_time": "14m 13s"} +{"loss": 0.24683557, "grad_norm": 0.29154928, "learning_rate": 1.059e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064146, "rewards/chosen": 21.375, "rewards/rejected": -10.25, "rewards/accuracies": 1.0, "rewards/margins": 31.5, "logps/rejected": -668.0, "logps/chosen": -232.0, "logits/rejected": -0.16601562, 
"logits/chosen": -0.05639648, "nll_loss": 0.25, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "51m 53s", "remaining_time": "12m 58s"} +{"eval_loss": 0.42285156, "eval_runtime": 7.0303, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -9.125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.5, "eval_logps/rejected": -316.0, "eval_logps/chosen": -6.15625, "eval_logits/rejected": -0.3203125, "eval_logits/chosen": -0.8125, "eval_nll_loss": 0.29296875, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "52m 0s", "remaining_time": "13m 0s"} +{"loss": 0.26675062, "grad_norm": 0.50634203, "learning_rate": 8.63e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063779, "rewards/chosen": 22.125, "rewards/rejected": -6.34375, "rewards/accuracies": 1.0, "rewards/margins": 28.5, "logps/rejected": -652.0, "logps/chosen": -272.0, "logits/rejected": -0.21582031, "logits/chosen": -0.2109375, "nll_loss": 0.24121094, "epoch": 4.1, "global_step/max_steps": "205/250", "percentage": "82.00%", "elapsed_time": "53m 29s", "remaining_time": "11m 44s"} +{"loss": 0.20104532, "grad_norm": 0.54961483, "learning_rate": 6.87e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063893, "rewards/chosen": 25.375, "rewards/rejected": -0.03442383, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -238.0, "logps/chosen": -185.0, "logits/rejected": -0.5625, "logits/chosen": -0.26953125, "nll_loss": 0.21191406, "epoch": 4.2, "global_step/max_steps": "210/250", "percentage": "84.00%", "elapsed_time": "54m 42s", "remaining_time": "10m 25s"} +{"loss": 0.241748, "grad_norm": 0.44211747, "learning_rate": 5.29e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06391, "rewards/chosen": 17.625, "rewards/rejected": -19.0, "rewards/accuracies": 1.0, "rewards/margins": 36.75, "logps/rejected": -1128.0, "logps/chosen": -120.0, 
"logits/rejected": 0.01623535, "logits/chosen": -0.4375, "nll_loss": 0.11572266, "epoch": 4.3, "global_step/max_steps": "215/250", "percentage": "86.00%", "elapsed_time": "55m 59s", "remaining_time": "9m 6s"} +{"loss": 0.22104406, "grad_norm": 0.38452329, "learning_rate": 3.9e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063802, "rewards/chosen": 14.125, "rewards/rejected": -23.625, "rewards/accuracies": 1.0, "rewards/margins": 37.75, "logps/rejected": -1048.0, "logps/chosen": -71.5, "logits/rejected": -0.25, "logits/chosen": -0.50390625, "nll_loss": 0.0559082, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "57m 23s", "remaining_time": "7m 49s"} +{"eval_loss": 0.43017578, "eval_runtime": 7.0342, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.75, "eval_logps/rejected": -320.0, "eval_logps/chosen": -6.78125, "eval_logits/rejected": -0.31640625, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.32421875, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "57m 30s", "remaining_time": "7m 50s"} +{"loss": 0.24149389, "grad_norm": 0.37093545, "learning_rate": 2.72e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063634, "rewards/chosen": 31.75, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 27.375, "logps/rejected": -209.0, "logps/chosen": -476.0, "logits/rejected": -0.62109375, "logits/chosen": -0.20214844, "nll_loss": 0.4921875, "epoch": 4.5, "global_step/max_steps": "225/250", "percentage": "90.00%", "elapsed_time": "58m 51s", "remaining_time": "6m 32s"} +{"loss": 0.20839577, "grad_norm": 0.42350659, "learning_rate": 1.75e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063609, "rewards/chosen": 24.125, "rewards/rejected": -11.4375, "rewards/accuracies": 1.0, "rewards/margins": 35.5, "logps/rejected": 
-804.0, "logps/chosen": -210.0, "logits/rejected": -0.53125, "logits/chosen": 0.09521484, "nll_loss": 0.265625, "epoch": 4.6, "global_step/max_steps": "230/250", "percentage": "92.00%", "elapsed_time": "1h 0m 11s", "remaining_time": "5m 14s"} +{"loss": 0.17243645, "grad_norm": 0.74527961, "learning_rate": 9.9e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063676, "rewards/chosen": 17.375, "rewards/rejected": -18.5, "rewards/accuracies": 1.0, "rewards/margins": 35.75, "logps/rejected": -772.0, "logps/chosen": -153.0, "logits/rejected": -0.24707031, "logits/chosen": -0.5078125, "nll_loss": 0.1875, "epoch": 4.7, "global_step/max_steps": "235/250", "percentage": "94.00%", "elapsed_time": "1h 1m 25s", "remaining_time": "3m 55s"} +{"loss": 0.2119802, "grad_norm": 0.31765565, "learning_rate": 4.4e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063884, "rewards/chosen": 28.125, "rewards/rejected": -6.71875, "rewards/accuracies": 1.0, "rewards/margins": 35.0, "logps/rejected": -664.0, "logps/chosen": -308.0, "logits/rejected": -0.640625, "logits/chosen": -0.45117188, "nll_loss": 0.44140625, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "1h 2m 32s", "remaining_time": "2m 36s"} +{"eval_loss": 0.43310547, "eval_runtime": 7.1052, "eval_samples_per_second": 0.563, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.75, "eval_logps/rejected": -320.0, "eval_logps/chosen": -7.0, "eval_logits/rejected": -0.31445312, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.33398438, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "1h 2m 39s", "remaining_time": "2m 36s"} +{"loss": 0.25767584, "grad_norm": 0.44758291, "learning_rate": 1.1e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063828, "rewards/chosen": 24.875, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, 
"rewards/margins": 27.0, "logps/rejected": -520.0, "logps/chosen": -416.0, "logits/rejected": -0.609375, "logits/chosen": -0.2734375, "nll_loss": 0.41796875, "epoch": 4.9, "global_step/max_steps": "245/250", "percentage": "98.00%", "elapsed_time": "1h 3m 53s", "remaining_time": "1m 18s"} +{"loss": 0.29092078, "grad_norm": 0.30988696, "learning_rate": 0.0, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064005, "rewards/chosen": 23.25, "rewards/rejected": -9.375, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -508.0, "logps/chosen": -274.0, "logits/rejected": -0.6328125, "logits/chosen": -0.5, "nll_loss": 0.30078125, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 5m 1s", "remaining_time": "0s"} +{"eval_loss": 0.43066406, "eval_runtime": 7.0302, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 18.0, "eval_logps/rejected": -322.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": -0.31640625, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.32617188, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 5m 8s", "remaining_time": "0s"} +{"train_runtime": 3912.3378, "train_samples_per_second": 0.505, "train_steps_per_second": 0.064, "total_flos": 1376108982829056.0, "train_loss": 0.42191185, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 5m 12s", "remaining_time": "0s"} +{"train_dataset": "1149.291139±483.948641, min=300.000000, max=4044.000000, size=395", "val_dataset": "1146.000000±535.563255, min=640.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 70657.2534M Params (103.5469M Trainable [0.1465%]), 0.0001M Buffers.", "last_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-250", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/checkpoint-100", "best_metric": 0.40966797, "global_step": 250, "log_history": [{"loss": 2.16796875, "grad_norm": 13.129363871319395, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 28.09, "train_speed(iter/s)": 0.050045, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -156.0, "logps/chosen": -490.0, "logits/rejected": -0.50390625, "logits/chosen": -0.494140625, "nll_loss": 0.50390625, "epoch": 0.02, "step": 1}, {"loss": 2.149658203125, "grad_norm": 13.47369483855226, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 30.65, "train_speed(iter/s)": 0.070826, "rewards/chosen": -0.125, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": -0.125, "logps/rejected": -232.0, "logps/chosen": -568.0, "logits/rejected": 0.09326171875, "logits/chosen": 1.5078125, "nll_loss": 0.75390625, "epoch": 0.1, "step": 5}, {"loss": 1.72822265625, "grad_norm": 9.281799302422854, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 35.11, "train_speed(iter/s)": 0.065681, "rewards/chosen": 0.609375, "rewards/rejected": 0.12060546875, "rewards/accuracies": 0.6000000238418579, "rewards/margins": 0.48828125, "logps/rejected": -458.0, "logps/chosen": -478.0, "logits/rejected": -0.38671875, "logits/chosen": -0.6015625, "nll_loss": 1.125, "epoch": 0.2, "step": 10}, {"loss": 1.05458984375, "grad_norm": 4.394517104497035, "learning_rate": 9.998242976313776e-05, "memory(GiB)": 57.44, "train_speed(iter/s)": 0.061064, "rewards/chosen": 4.5625, "rewards/rejected": 1.703125, "rewards/accuracies": 0.800000011920929, "rewards/margins": 2.859375, "logps/rejected": -928.0, "logps/chosen": -444.0, "logits/rejected": 
0.04931640625, "logits/chosen": -0.212890625, "nll_loss": 0.6328125, "epoch": 0.3, "step": 15}, {"loss": 0.703759765625, "grad_norm": 0.5300094805075983, "learning_rate": 9.97849063861667e-05, "memory(GiB)": 57.44, "train_speed(iter/s)": 0.062406, "rewards/chosen": 6.4375, "rewards/rejected": 1.546875, "rewards/accuracies": 1.0, "rewards/margins": 4.90625, "logps/rejected": -318.0, "logps/chosen": -300.0, "logits/rejected": -0.431640625, "logits/chosen": -0.6015625, "nll_loss": 0.4765625, "epoch": 0.4, "step": 20}, {"eval_loss": 0.53662109375, "eval_runtime": 7.0778, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 8.0, "eval_rewards/rejected": 3.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.09375, "eval_logps/rejected": -186.0, "eval_logps/chosen": -9.875, "eval_logits/rejected": -0.3828125, "eval_logits/chosen": -0.953125, "eval_nll_loss": 0.470703125, "epoch": 0.4, "step": 20}, {"loss": 0.5664306640625, "grad_norm": 0.5833158966477413, "learning_rate": 9.936876709681668e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.061629, "rewards/chosen": 10.125, "rewards/rejected": 0.0810546875, "rewards/accuracies": 1.0, "rewards/margins": 10.0625, "logps/rejected": -394.0, "logps/chosen": -255.0, "logits/rejected": -0.34765625, "logits/chosen": -0.466796875, "nll_loss": 0.412109375, "epoch": 0.5, "step": 25}, {"loss": 0.518994140625, "grad_norm": 0.5069545206823683, "learning_rate": 9.873583924954152e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.063686, "rewards/chosen": 10.3125, "rewards/rejected": 0.6015625, "rewards/accuracies": 1.0, "rewards/margins": 9.6875, "logps/rejected": -584.0, "logps/chosen": -312.0, "logits/rejected": -0.63671875, "logits/chosen": -0.81640625, "nll_loss": 0.65234375, "epoch": 0.6, "step": 30}, {"loss": 0.51473388671875, "grad_norm": 1.0656827195195997, "learning_rate": 9.788890216258939e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.065515, "rewards/chosen": 15.0625, 
"rewards/rejected": 2.671875, "rewards/accuracies": 1.0, "rewards/margins": 12.375, "logps/rejected": -214.0, "logps/chosen": -420.0, "logits/rejected": -0.60546875, "logits/chosen": -0.21875, "nll_loss": 0.68359375, "epoch": 0.7, "step": 35}, {"loss": 0.45928955078125, "grad_norm": 0.6780020698373402, "learning_rate": 9.68316749134364e-05, "memory(GiB)": 61.35, "train_speed(iter/s)": 0.064272, "rewards/chosen": 11.875, "rewards/rejected": -0.53515625, "rewards/accuracies": 1.0, "rewards/margins": 12.375, "logps/rejected": -368.0, "logps/chosen": -174.0, "logits/rejected": -0.640625, "logits/chosen": -0.6171875, "nll_loss": 0.52734375, "epoch": 0.8, "step": 40}, {"eval_loss": 0.427734375, "eval_runtime": 7.0584, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -1.8984375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.3125, "eval_logps/rejected": -244.0, "eval_logps/chosen": -5.4375, "eval_logits/rejected": -0.38671875, "eval_logits/chosen": -0.859375, "eval_nll_loss": 0.259765625, "epoch": 0.8, "step": 40}, {"loss": 0.4487548828125, "grad_norm": 0.7227421522145911, "learning_rate": 9.55688000075414e-05, "memory(GiB)": 70.32, "train_speed(iter/s)": 0.063461, "rewards/chosen": 17.75, "rewards/rejected": -0.259765625, "rewards/accuracies": 1.0, "rewards/margins": 18.0, "logps/rejected": -446.0, "logps/chosen": -376.0, "logits/rejected": -0.88671875, "logits/chosen": -0.46484375, "nll_loss": 0.4296875, "epoch": 0.9, "step": 45}, {"loss": 0.442578125, "grad_norm": 0.9252536379780557, "learning_rate": 9.410582299213573e-05, "memory(GiB)": 72.28, "train_speed(iter/s)": 0.064239, "rewards/chosen": 12.5, "rewards/rejected": -0.51953125, "rewards/accuracies": 1.0, "rewards/margins": 13.0, "logps/rejected": -1016.0, "logps/chosen": -384.0, "logits/rejected": -0.07275390625, "logits/chosen": -0.1796875, "nll_loss": 0.40234375, "epoch": 1.0, "step": 50}, {"loss": 0.4554443359375, "grad_norm": 
0.5443623347983845, "learning_rate": 9.244916810456821e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064341, "rewards/chosen": 19.0, "rewards/rejected": -1.8984375, "rewards/accuracies": 1.0, "rewards/margins": 21.0, "logps/rejected": -636.0, "logps/chosen": -310.0, "logits/rejected": -0.490234375, "logits/chosen": 0.01220703125, "nll_loss": 0.4609375, "epoch": 1.1, "step": 55}, {"loss": 0.41307373046875, "grad_norm": 0.9491701785504059, "learning_rate": 9.060611006213832e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064763, "rewards/chosen": 13.0625, "rewards/rejected": -1.1953125, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -532.0, "logps/chosen": -173.0, "logits/rejected": -0.37109375, "logits/chosen": -0.34765625, "nll_loss": 0.310546875, "epoch": 1.2, "step": 60}, {"eval_loss": 0.41845703125, "eval_runtime": 7.374, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.136, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.9375, "eval_logps/rejected": -270.0, "eval_logps/chosen": -5.46875, "eval_logits/rejected": -0.32421875, "eval_logits/chosen": -0.8203125, "eval_nll_loss": 0.259765625, "epoch": 1.2, "step": 60}, {"loss": 0.461590576171875, "grad_norm": 0.6746923195914712, "learning_rate": 8.858474211729469e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064043, "rewards/chosen": 13.0, "rewards/rejected": -12.625, "rewards/accuracies": 1.0, "rewards/margins": 25.625, "logps/rejected": -820.0, "logps/chosen": -249.0, "logits/rejected": -0.1865234375, "logits/chosen": -0.32421875, "nll_loss": 0.462890625, "epoch": 1.3, "step": 65}, {"loss": 0.37837066650390627, "grad_norm": 0.4557775784440204, "learning_rate": 8.639394051847472e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064771, "rewards/chosen": 10.0, "rewards/rejected": -24.75, "rewards/accuracies": 1.0, "rewards/margins": 34.75, "logps/rejected": -1112.0, "logps/chosen": -47.0, 
"logits/rejected": -0.51953125, "logits/chosen": -0.73828125, "nll_loss": 0.4921875, "epoch": 1.4, "step": 70}, {"loss": 0.4122833251953125, "grad_norm": 0.4549474900285614, "learning_rate": 8.404332553264547e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064901, "rewards/chosen": 16.75, "rewards/rejected": -10.3125, "rewards/accuracies": 1.0, "rewards/margins": 27.0, "logps/rejected": -478.0, "logps/chosen": -350.0, "logits/rejected": -0.6640625, "logits/chosen": -0.33984375, "nll_loss": 0.361328125, "epoch": 1.5, "step": 75}, {"loss": 0.42242431640625, "grad_norm": 0.41018476502679196, "learning_rate": 8.154321920070414e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065678, "rewards/chosen": 22.0, "rewards/rejected": 0.1708984375, "rewards/accuracies": 1.0, "rewards/margins": 21.875, "logps/rejected": -153.0, "logps/chosen": -428.0, "logits/rejected": -0.439453125, "logits/chosen": 0.2255859375, "nll_loss": 0.5078125, "epoch": 1.6, "step": 80}, {"eval_loss": 0.41748046875, "eval_runtime": 7.0481, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -16.25, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 24.75, "eval_logps/rejected": -388.0, "eval_logps/chosen": -5.625, "eval_logits/rejected": -0.326171875, "eval_logits/chosen": -0.82421875, "eval_nll_loss": 0.267578125, "epoch": 1.6, "step": 80}, {"loss": 0.54422607421875, "grad_norm": 0.6576496300320993, "learning_rate": 7.890460001124242e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065121, "rewards/chosen": 13.5625, "rewards/rejected": -14.875, "rewards/accuracies": 1.0, "rewards/margins": 28.5, "logps/rejected": -796.0, "logps/chosen": -217.0, "logits/rejected": -0.5078125, "logits/chosen": -0.33984375, "nll_loss": 0.30859375, "epoch": 1.7, "step": 85}, {"loss": 0.39871826171875, "grad_norm": 0.6545271461639326, "learning_rate": 7.613905469171246e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064715, 
"rewards/chosen": 14.1875, "rewards/rejected": -8.875, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -532.0, "logps/chosen": -235.0, "logits/rejected": -0.73828125, "logits/chosen": -0.375, "nll_loss": 0.45703125, "epoch": 1.8, "step": 90}, {"loss": 0.41851806640625, "grad_norm": 0.3935793421122154, "learning_rate": 7.325872732868869e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06466, "rewards/chosen": 17.25, "rewards/rejected": -9.25, "rewards/accuracies": 1.0, "rewards/margins": 26.5, "logps/rejected": -564.0, "logps/chosen": -282.0, "logits/rejected": -0.328125, "logits/chosen": -0.328125, "nll_loss": 0.37109375, "epoch": 1.9, "step": 95}, {"loss": 0.38671875, "grad_norm": 0.36803625473665347, "learning_rate": 7.027626604064969e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064448, "rewards/chosen": 13.6875, "rewards/rejected": -5.375, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -384.0, "logps/chosen": -212.0, "logits/rejected": -0.42578125, "logits/chosen": -0.390625, "nll_loss": 0.3359375, "epoch": 2.0, "step": 100}, {"eval_loss": 0.40966796875, "eval_runtime": 6.9622, "eval_samples_per_second": 0.575, "eval_steps_per_second": 0.144, "eval_rewards/chosen": 8.5, "eval_rewards/rejected": -7.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.375, "eval_logps/rejected": -304.0, "eval_logps/chosen": -5.125, "eval_logits/rejected": -0.45703125, "eval_logits/chosen": -0.87109375, "eval_nll_loss": 0.244140625, "epoch": 2.0, "step": 100}, {"loss": 0.3717987060546875, "grad_norm": 0.5015689962645361, "learning_rate": 6.720476743745072e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063793, "rewards/chosen": 20.875, "rewards/rejected": -5.71875, "rewards/accuracies": 1.0, "rewards/margins": 26.625, "logps/rejected": -640.0, "logps/chosen": -332.0, "logits/rejected": -0.7109375, "logits/chosen": -0.396484375, "nll_loss": 0.435546875, "epoch": 2.1, "step": 105}, {"loss": 
0.3555938720703125, "grad_norm": 0.6199367674381827, "learning_rate": 6.405771911037699e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063742, "rewards/chosen": 15.9375, "rewards/rejected": -16.5, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -1040.0, "logps/chosen": -306.0, "logits/rejected": -0.25, "logits/chosen": -0.291015625, "nll_loss": 0.33203125, "epoch": 2.2, "step": 110}, {"loss": 0.38040924072265625, "grad_norm": 0.3369780668254126, "learning_rate": 6.08489404053159e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063906, "rewards/chosen": 18.625, "rewards/rejected": -7.21875, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -696.0, "logps/chosen": -354.0, "logits/rejected": -0.44921875, "logits/chosen": -0.2109375, "nll_loss": 0.36328125, "epoch": 2.3, "step": 115}, {"loss": 0.3116204261779785, "grad_norm": 0.5972625645626695, "learning_rate": 5.7592521739125726e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064398, "rewards/chosen": 17.25, "rewards/rejected": -1.84375, "rewards/accuracies": 1.0, "rewards/margins": 19.0, "logps/rejected": -326.0, "logps/chosen": -278.0, "logits/rejected": -0.546875, "logits/chosen": -0.43359375, "nll_loss": 0.267578125, "epoch": 2.4, "step": 120}, {"eval_loss": 0.4169921875, "eval_runtime": 7.0281, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -8.125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.5, "eval_logps/rejected": -306.0, "eval_logps/chosen": -5.84375, "eval_logits/rejected": -0.3515625, "eval_logits/chosen": -0.83984375, "eval_nll_loss": 0.279296875, "epoch": 2.4, "step": 120}, {"loss": 0.32363739013671877, "grad_norm": 0.7619924076423551, "learning_rate": 5.430276272567485e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064435, "rewards/chosen": 12.25, "rewards/rejected": -16.0, "rewards/accuracies": 1.0, "rewards/margins": 28.25, "logps/rejected": -1072.0, 
"logps/chosen": -17.5, "logits/rejected": -0.5078125, "logits/chosen": -0.859375, "nll_loss": 0.2177734375, "epoch": 2.5, "step": 125}, {"loss": 0.31024856567382814, "grad_norm": 1.419414316974697, "learning_rate": 5.0994109383253506e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064846, "rewards/chosen": 19.125, "rewards/rejected": -2.640625, "rewards/accuracies": 1.0, "rewards/margins": 21.75, "logps/rejected": -278.0, "logps/chosen": -304.0, "logits/rejected": -0.66015625, "logits/chosen": -0.52734375, "nll_loss": 0.30078125, "epoch": 2.6, "step": 130}, {"loss": 0.29241142272949217, "grad_norm": 1.043372967452423, "learning_rate": 4.768109069909307e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064914, "rewards/chosen": 12.3125, "rewards/rejected": -13.625, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -664.0, "logps/chosen": -116.0, "logits/rejected": -0.34375, "logits/chosen": -0.484375, "nll_loss": 0.126953125, "epoch": 2.7, "step": 135}, {"loss": 0.33972806930541993, "grad_norm": 0.424388253912613, "learning_rate": 4.4378254829551396e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.065209, "rewards/chosen": 18.125, "rewards/rejected": -7.8125, "rewards/accuracies": 1.0, "rewards/margins": 26.0, "logps/rejected": -684.0, "logps/chosen": -276.0, "logits/rejected": -0.62890625, "logits/chosen": -0.4765625, "nll_loss": 0.498046875, "epoch": 2.8, "step": 140}, {"eval_loss": 0.4150390625, "eval_runtime": 7.0195, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.4375, "eval_rewards/rejected": -8.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.75, "eval_logps/rejected": -308.0, "eval_logps/chosen": -5.71875, "eval_logits/rejected": -0.25, "eval_logits/chosen": -0.8125, "eval_nll_loss": 0.271484375, "epoch": 2.8, "step": 140}, {"loss": 0.3439308166503906, "grad_norm": 1.6261110291294545, "learning_rate": 4.11001052161225e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 
0.064432, "rewards/chosen": 20.25, "rewards/rejected": -6.90625, "rewards/accuracies": 1.0, "rewards/margins": 27.125, "logps/rejected": -652.0, "logps/chosen": -434.0, "logits/rejected": -0.41796875, "logits/chosen": -0.1591796875, "nll_loss": 0.458984375, "epoch": 2.9, "step": 145}, {"loss": 0.264852237701416, "grad_norm": 0.3948730156432553, "learning_rate": 3.786103689779861e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064146, "rewards/chosen": 22.0, "rewards/rejected": -1.6640625, "rewards/accuracies": 1.0, "rewards/margins": 23.625, "logps/rejected": -284.0, "logps/chosen": -328.0, "logits/rejected": -0.451171875, "logits/chosen": -0.013671875, "nll_loss": 0.30078125, "epoch": 3.0, "step": 150}, {"loss": 0.31949734687805176, "grad_norm": 0.39090195989326426, "learning_rate": 3.467527329945026e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064309, "rewards/chosen": 21.125, "rewards/rejected": -11.4375, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -648.0, "logps/chosen": -434.0, "logits/rejected": -0.5234375, "logits/chosen": -0.3671875, "nll_loss": 0.271484375, "epoch": 3.1, "step": 155}, {"loss": 0.2711037635803223, "grad_norm": 0.31310930959607486, "learning_rate": 3.1556803773799614e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064187, "rewards/chosen": 23.75, "rewards/rejected": -0.265625, "rewards/accuracies": 1.0, "rewards/margins": 24.0, "logps/rejected": -382.0, "logps/chosen": -320.0, "logits/rejected": -0.765625, "logits/chosen": -0.47265625, "nll_loss": 0.30078125, "epoch": 3.2, "step": 160}, {"eval_loss": 0.41845703125, "eval_runtime": 7.0385, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -8.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.0, "eval_logps/rejected": -312.0, "eval_logps/chosen": -5.90625, "eval_logits/rejected": -0.3125, "eval_logits/chosen": -0.81640625, "eval_nll_loss": 0.28125, "epoch": 3.2, "step": 
160}, {"loss": 0.26008996963500974, "grad_norm": 0.7365315552744873, "learning_rate": 2.8519322171253602e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064049, "rewards/chosen": 13.75, "rewards/rejected": -11.5625, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -520.0, "logps/chosen": -120.0, "logits/rejected": -0.35546875, "logits/chosen": -0.57421875, "nll_loss": 0.10498046875, "epoch": 3.3, "step": 165}, {"loss": 0.22614412307739257, "grad_norm": 0.4367548710430112, "learning_rate": 2.5576166707349385e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063846, "rewards/chosen": 26.25, "rewards/rejected": 1.578125, "rewards/accuracies": 1.0, "rewards/margins": 24.75, "logps/rejected": -171.0, "logps/chosen": -466.0, "logits/rejected": -0.6328125, "logits/chosen": 0.1796875, "nll_loss": 0.3125, "epoch": 3.4, "step": 170}, {"loss": 0.24147272109985352, "grad_norm": 0.5745096147296118, "learning_rate": 2.2740261391866637e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063902, "rewards/chosen": 24.625, "rewards/rejected": -4.125, "rewards/accuracies": 1.0, "rewards/margins": 28.75, "logps/rejected": -540.0, "logps/chosen": -255.0, "logits/rejected": 0.0125732421875, "logits/chosen": 0.56640625, "nll_loss": 0.337890625, "epoch": 3.5, "step": 175}, {"loss": 0.30578501224517823, "grad_norm": 0.3506997416185766, "learning_rate": 2.002405927680374e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064229, "rewards/chosen": 25.5, "rewards/rejected": -2.515625, "rewards/accuracies": 1.0, "rewards/margins": 28.0, "logps/rejected": -284.0, "logps/chosen": -394.0, "logits/rejected": -0.6953125, "logits/chosen": -0.306640625, "nll_loss": 0.435546875, "epoch": 3.6, "step": 180}, {"eval_loss": 0.4306640625, "eval_runtime": 7.0569, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.3125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.625, "eval_logps/rejected": 
-318.0, "eval_logps/chosen": -6.96875, "eval_logits/rejected": -0.302734375, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.33203125, "epoch": 3.6, "step": 180}, {"loss": 0.262108588218689, "grad_norm": 0.5026347543849498, "learning_rate": 1.743948777242814e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06422, "rewards/chosen": 24.375, "rewards/rejected": -0.4765625, "rewards/accuracies": 1.0, "rewards/margins": 24.875, "logps/rejected": -221.0, "logps/chosen": -360.0, "logits/rejected": -0.478515625, "logits/chosen": -0.13671875, "nll_loss": 0.41796875, "epoch": 3.7, "step": 185}, {"loss": 0.25142607688903806, "grad_norm": 0.45140535411962257, "learning_rate": 1.4997896271528739e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064487, "rewards/chosen": 24.25, "rewards/rejected": -6.90625, "rewards/accuracies": 1.0, "rewards/margins": 31.125, "logps/rejected": -604.0, "logps/chosen": -185.0, "logits/rejected": -0.486328125, "logits/chosen": -0.58203125, "nll_loss": 0.2470703125, "epoch": 3.8, "step": 190}, {"loss": 0.2213657855987549, "grad_norm": 0.560007261391276, "learning_rate": 1.2710006311864104e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064327, "rewards/chosen": 20.75, "rewards/rejected": -10.375, "rewards/accuracies": 1.0, "rewards/margins": 31.125, "logps/rejected": -656.0, "logps/chosen": -332.0, "logits/rejected": -0.2177734375, "logits/chosen": 0.47265625, "nll_loss": 0.275390625, "epoch": 3.9, "step": 195}, {"loss": 0.2468355655670166, "grad_norm": 0.2915492783630129, "learning_rate": 1.0585864495652897e-05, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064146, "rewards/chosen": 21.375, "rewards/rejected": -10.25, "rewards/accuracies": 1.0, "rewards/margins": 31.5, "logps/rejected": -668.0, "logps/chosen": -232.0, "logits/rejected": -0.166015625, "logits/chosen": -0.056396484375, "nll_loss": 0.25, "epoch": 4.0, "step": 200}, {"eval_loss": 0.4228515625, "eval_runtime": 7.0303, "eval_samples_per_second": 0.569, 
"eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.375, "eval_rewards/rejected": -9.125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.5, "eval_logps/rejected": -316.0, "eval_logps/chosen": -6.15625, "eval_logits/rejected": -0.3203125, "eval_logits/chosen": -0.8125, "eval_nll_loss": 0.29296875, "epoch": 4.0, "step": 200}, {"loss": 0.2667506217956543, "grad_norm": 0.506342030165448, "learning_rate": 8.634798372847148e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063779, "rewards/chosen": 22.125, "rewards/rejected": -6.34375, "rewards/accuracies": 1.0, "rewards/margins": 28.5, "logps/rejected": -652.0, "logps/chosen": -272.0, "logits/rejected": -0.2158203125, "logits/chosen": -0.2109375, "nll_loss": 0.2412109375, "epoch": 4.1, "step": 205}, {"loss": 0.20104532241821288, "grad_norm": 0.5496148316299561, "learning_rate": 6.865375481914016e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063893, "rewards/chosen": 25.375, "rewards/rejected": -0.034423828125, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -238.0, "logps/chosen": -185.0, "logits/rejected": -0.5625, "logits/chosen": -0.26953125, "nll_loss": 0.2119140625, "epoch": 4.2, "step": 210}, {"loss": 0.24174799919128417, "grad_norm": 0.44211747113636407, "learning_rate": 5.285365727986707e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.06391, "rewards/chosen": 17.625, "rewards/rejected": -19.0, "rewards/accuracies": 1.0, "rewards/margins": 36.75, "logps/rejected": -1128.0, "logps/chosen": -120.0, "logits/rejected": 0.0162353515625, "logits/chosen": -0.4375, "nll_loss": 0.11572265625, "epoch": 4.3, "step": 215}, {"loss": 0.22104406356811523, "grad_norm": 0.38452329355351084, "learning_rate": 3.901707263589671e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063802, "rewards/chosen": 14.125, "rewards/rejected": -23.625, "rewards/accuracies": 1.0, "rewards/margins": 37.75, "logps/rejected": -1048.0, "logps/chosen": -71.5, "logits/rejected": -0.25, 
"logits/chosen": -0.50390625, "nll_loss": 0.055908203125, "epoch": 4.4, "step": 220}, {"eval_loss": 0.43017578125, "eval_runtime": 7.0342, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.75, "eval_logps/rejected": -320.0, "eval_logps/chosen": -6.78125, "eval_logits/rejected": -0.31640625, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.32421875, "epoch": 4.4, "step": 220}, {"loss": 0.24149389266967775, "grad_norm": 0.3709354527062318, "learning_rate": 2.7204760217631074e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063634, "rewards/chosen": 31.75, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 27.375, "logps/rejected": -209.0, "logps/chosen": -476.0, "logits/rejected": -0.62109375, "logits/chosen": -0.2021484375, "nll_loss": 0.4921875, "epoch": 4.5, "step": 225}, {"loss": 0.20839576721191405, "grad_norm": 0.4235065854735603, "learning_rate": 1.7468590353731495e-06, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063609, "rewards/chosen": 24.125, "rewards/rejected": -11.4375, "rewards/accuracies": 1.0, "rewards/margins": 35.5, "logps/rejected": -804.0, "logps/chosen": -210.0, "logits/rejected": -0.53125, "logits/chosen": 0.09521484375, "nll_loss": 0.265625, "epoch": 4.6, "step": 230}, {"loss": 0.17243645191192628, "grad_norm": 0.7452796123378559, "learning_rate": 9.851316597681958e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063676, "rewards/chosen": 17.375, "rewards/rejected": -18.5, "rewards/accuracies": 1.0, "rewards/margins": 35.75, "logps/rejected": -772.0, "logps/chosen": -153.0, "logits/rejected": -0.2470703125, "logits/chosen": -0.5078125, "nll_loss": 0.1875, "epoch": 4.7, "step": 235}, {"loss": 0.21198019981384278, "grad_norm": 0.31765564689044157, "learning_rate": 4.386387988014273e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063884, "rewards/chosen": 28.125, 
"rewards/rejected": -6.71875, "rewards/accuracies": 1.0, "rewards/margins": 35.0, "logps/rejected": -664.0, "logps/chosen": -308.0, "logits/rejected": -0.640625, "logits/chosen": -0.451171875, "nll_loss": 0.44140625, "epoch": 4.8, "step": 240}, {"eval_loss": 0.43310546875, "eval_runtime": 7.1052, "eval_samples_per_second": 0.563, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 17.75, "eval_logps/rejected": -320.0, "eval_logps/chosen": -7.0, "eval_logits/rejected": -0.314453125, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.333984375, "epoch": 4.8, "step": 240}, {"loss": 0.257675838470459, "grad_norm": 0.44758290651304805, "learning_rate": 1.0978021666005478e-07, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.063828, "rewards/chosen": 24.875, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 27.0, "logps/rejected": -520.0, "logps/chosen": -416.0, "logits/rejected": -0.609375, "logits/chosen": -0.2734375, "nll_loss": 0.41796875, "epoch": 4.9, "step": 245}, {"loss": 0.2909207820892334, "grad_norm": 0.3098869617371432, "learning_rate": 0.0, "memory(GiB)": 77.63, "train_speed(iter/s)": 0.064005, "rewards/chosen": 23.25, "rewards/rejected": -9.375, "rewards/accuracies": 1.0, "rewards/margins": 32.5, "logps/rejected": -508.0, "logps/chosen": -274.0, "logits/rejected": -0.6328125, "logits/chosen": -0.5, "nll_loss": 0.30078125, "epoch": 5.0, "step": 250}, {"eval_loss": 0.4306640625, "eval_runtime": 7.0302, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 8.3125, "eval_rewards/rejected": -9.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 18.0, "eval_logps/rejected": -322.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": -0.31640625, "eval_logits/chosen": -0.80859375, "eval_nll_loss": 0.326171875, "epoch": 5.0, "step": 250}, {"train_runtime": 3912.3378, "train_samples_per_second": 
0.505, "train_steps_per_second": 0.064, "total_flos": 1376108982829056.0, "train_loss": 0.42191185140609744, "epoch": 5.0, "step": 250}], "memory": 77.630859375} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs/events.out.tfevents.1737739537.kml-task-547024-record-9965643-prod-worker-0.69729.0 b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs/events.out.tfevents.1737739537.kml-task-547024-record-9965643-prod-worker-0.69729.0 new file mode 100644 index 0000000000000000000000000000000000000000..800a0832c08f24f0d2cd500a392a3dedda1798dd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250124-172327/runs/events.out.tfevents.1737739537.kml-task-547024-record-9965643-prod-worker-0.69729.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61795d7821e89e4c5c141382c35543ee7d5b0086a65ab62cdba50b564ede6a1c +size 61864 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/args.json new file mode 100644 index 0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + 
"sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + 
"max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + 
"pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + 
"batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + 
"lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, 
lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 
'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, 
max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + 
+ +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..26ce47bcfdff75e3a951618dd83d6977e89ccd98 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f5db6fd0687751d575aed7a33e690472cc9e6c427b846c29e8e839defb7cc4f +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cf0b8ca01db52179207095748fa1867078e9c56 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5d8bcf308c1b8f82f7e860c1ff746c9371e134a2843494d1fb92cfde97eb7c +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c22bf20c8741fcc2a19e68e6b4d5977b54579ce --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ab565cd8eecf8386f202a87daf26b243f7a7f6394d6c0310e889a0f33330cf2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f67598d276ad3643456c6969adb60b37d5b2853 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d1d5df5e8b89385d018fd488896b834538378e2c8a0e5c7cd0dd3e6c12617f +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0abae6bb363426e2995c2ac063dcd902a84ca165 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a40f1453d7134bbf11d410b8eaa78ae83e5f4684c875a0e827952ce70c8128e2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3429a1cd66fde62710f9120fe590ebfd9d951e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47974050047680a3ecaf60659ab22586d830141b54221a95affec1545d361c8b +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3ce9e3056b92224831ee32bb89b44ba63dc8cccd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad5800f40623af53556d479574371a49f171f749f3f216a8042cd2618f646bf +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc5e59fcbd5966c9fca5e558c724792d5c75feba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:111c543483b62bd33137d30182f09072cc766af6671cbf68114f62a8044c2b22 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b4aca585d592fd104b44c3253276cd39ca6f02f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2c826562697ef9c474784bbddb74436ea1e9d0f8135e987871d2fb98709b9dd6 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f822501d5822397820601b52ff19c5d7164b4295 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abeee0f28b3a7ef55b5c677b5cd04c62d917bc78a6582decbe1aa447e160a2f5 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a46b5e6da55796b08c4691a6f95361cfa8079613 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0f6092ad4b400c3a6ac5abe25dd94720e4ec2ecf8310c5127abdcaae873da1 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..1607942f64be77f4685e08b0060a97e01c5580d9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ff5de98f157f7431052e5abe06c8e2ab4b42138fd91c88826cb56daa9169b4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c07dd0b3211ec242c9c70c75637615e4becb8d53 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104289d59ab4af2627c4cc06bc9aacbd3faf772d070c27fafe6161297f7e626c +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a0697106a13a064acf5fd016458c96aa42efb9e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82d35c567a72766be728a58b2b08eadace3eaf54b03f59b8ed041285b596005d +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf6dde6f85851ffabd0dea5c365203d294aa920b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3d4908a56f06e3481f6b86b199a8834c11c93a761d0e8f5617b81fce72a990 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8828f3921e32ee5e5f7dfe51ae199ba3d78d1d65 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6088d952c20a745cc58c2f0a27239d92f4bc9d15366ef61dd6f131c0e809669 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c9c2b8558f08875f7c436ae4222aa0725aa770ef --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3723ddeb824d63b389a6f562ee4b19f093efad28ce878f8e143cd082f80f0c66 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e44eb3fa45a557460f92c098f2fd1eccee781e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e2e2737d63e3807ccb5977a1e9e37b3ec45ffad7bcc727e9e137aebc01368 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0714a2f9d56c02b50246811c8a156cef4d462ac8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.41210938, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + } + 
], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 544165762433024.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..120fbd14d67e0feaf5b7da163a4e57d206fbb198 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f0345aa7d9ccadf83f6f43d5a537171f245bcadfe70bc2638b3a3f5a491367c +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eee2c093f322d88b1a337894fde8dd1b295ce206 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2d6480c48995f49f65b4ed4af08c3b47cd873827ac4020fa8d20d04d34a12f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d6ccacfef687d479f06a0150c00fff6ca634468 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9785717b6052a95e539277676ffa7e675f1cccfbb18c2b5c39560f7647a2ac41 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f64b7367a92f00b27f3e60f7fa17bed9965a9b33 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6128d880360ded3eee62b35003012a3037f388037fa0eeb0c59572e2d2c8e0 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e49d7e977e7834c89cba3b7777981cdba850f766 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035f918f04a327c0ba4594e7e73119edbcdafc3f968420a445c00500dbcd2d50 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cb985fb6b5a097a1e5ec3a72522f31eb444b2fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c46c0dab89958c7bdad3d96d2babecf979a04aefb4a00caff9852c7b74f9703 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..59fec8b03346b36bb98791fbf3f0a21cc53e730a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060027a2d9c87fa7ac2ecb20b0fbb1a708d89329f8157e95ca3b99eb73e78feb +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bfb591134f623edbee5c7426f867577bae6857d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704fd2bebb6331cb4f5e83732b110959f475079180c8ed00fd1f8709a60bbefd +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fbfdc5f6dcd49d87d810140a28e0fe8c3c6901d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c1117abb9f4b15dc90aff0f1a5214e69eb69e23f477974133cd13f19d704ddf6 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8eec0fe9ce19cd23cc07f36a4ca8cbae86e73cf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3510aaacf5039e060f4468f2f2d158717c1589acfeef61a474c7d8c6c152bd +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99bd15063823ba5bd4accc2313a867c5150fd69b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74adc72ee1cc40690fd1672e4922db0f1c44c2779468bf6c4bd3b870c4f5878 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..81fea5e70392376c852ab4576d92a179abdd59be --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b43e495bc25ef506c084ed733c9d59b39109254779ee59d468379bbad4ff015 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b8de3d27d90a19dd09d9cb708ae955939ccc89d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ca483442df2ad58f2499ab667e4fe435e0169508fed23221e02618231adb85 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be4410f8f8784df78d5e008d82ee932e8f51fdaa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d316ffc88b7c3087d488ec9b0b66f55d478459a7c8d9627ea7e1479e640e8ef +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9578fc039e2b728d747d02b0caaccbdf17fae347 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac79909caa2f128bf738bf5e201d13ad4a3e545f1fd4135394df9518dd28ea68 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3043c5723ed13952cea6aa12e3d29a862d235520 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493e4447686beccf07728b65cc6d1b13fc930094c8d6a60d70acf0411d904f35 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..82d5f2c61fa6d024d1976c542aa87b4bee87690c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca8396e32d30ca7b538bf4b044dc5841b959e460a378d73d78f64ac1c7844b2 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..0e13e0563ec45a863d519305a1251d3e72b9e3e4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/latest @@ -0,0 +1 @@ +global_step120 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056328cf4dbfbdfaf5b7ffa668b29852f77a3798 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b76da7ccfd8d1a286433da6127628e0c6a1565950b2dea51fe5864ad3e6545 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..31902dc3a42fb37ee1316b1e43153f51c280e66b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 2.4, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 651972994138112.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 
@@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there 
should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in 
state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aac416ad8d4b50eb484db01066dc0c8e252c92f6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c12b873fc9c704e76a1b9f83c2b368d9011f641e50ff8cb20847d850707ec1 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4a9026de437e7c9187232d8ef2d984502526746 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a7b11a0d8abdb4e745e4556ed0e403900e840adae268a5ee614faff853546b +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e48822d5a6cd0d0fe522a04b93b5689ee2f033f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e059addb58e9230c4d2bddf2619507998b1f0e70fa388df7f47a04f7986919 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3538a9043b02efc2171da50d635d0fd0b9c8874f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cb7d31b164decd459e6db4c21b55130c8ef4c54a957bf14353d07ff66f1ee9 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e80b8b422919198a12c1a761d34a31594d730c6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa09b6b92aac9922fadac33288a251e25a3d4e79ce43fe45583c8447bc717459 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86e7dcdc24532854a29f49962c16f863e5684664 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22c4fa1978374a5e190091bbf2a4d6bb1a3b6bbe2ba4d9d4e288634332239475 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..bac36cb96abbc452c2bc2d3ae9d168b51e954d4a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178134a8227750b022464e2da44fdf60d2a4d88f9df8c55a4cca99a8316ac5b4 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed8ae6197b7989e448fe8f4a0aa9ad6621bcfa84 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae23707a0105ef7b627906f515908635b2d525b15728c4c2cbc9548a05012398 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..880c3b994dfa5188fc3cea12f6fb1f3126724b8b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fd859f7c80eba47e6afed088452faf4ce7d6b45d765e1cb4002acccef49ce381 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..556a2447dff299500829d6b07996405b5aa7ef7e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db97381f0908046f9bfbb7d15408d7d57f7f8ae3ff2acb642fc60c91c5acc4b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d63c33b6beebfc35288a6b3959d2dec81c03109d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e5f8f2504e31f12e5f1a8bd898d18835a04c1afde31d47dffd0064a376b086 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..29165b01b8cf74a7b098ed230b9dc24091dc4c62 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ce0adda744ba5ce4586f31a72963d69143696af0be7ab81247c93ee06d71a5 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df18699badc8a051ebac62dacecaaabcab780ed0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ebc38a52c4212f0c8f2610340bb32ec0a02f72a74a94e246e42d4e5c060a01 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddee5feec04fc29a3092865fd8d67afe8d2dddc1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14e58d4f2035e166121994f82a2faf987802d9b14dafeb2ff470abfb828bd25 +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f22d48c8c5042ebcbfafed4dbbe239eb0e78464 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c4effdbfe2b03effdf04975026a1cb3f636a47a4b22642998e63b6f87cd1179 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5bc9448467ce35df91a3f5b31e919ebf07e5e03 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6840509dca0a783650cc30553d3c3475540fc870d6780d3a4bf7660a2116ba7b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0eb44ee2645a92bb28936d7fbce53298cabf913a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db628821ba422c19bbedfd8669d18113a8e66d2a2dc90cf1702314ea54a94f00 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd2b9aef86529798137c2868d556e873a23c785c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/latest @@ -0,0 +1 @@ +global_step140 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6e613ad02e1482b1eef52ff51329fe67d4fceb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9c57c64e42f5d7ec5b6fd8bf14122cd4f49a4ae907dcde9c057b79cc82e639 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eeca8250677fd78cb387340963c48d9cc1fda9f3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/trainer_state.json @@ -0,0 +1,674 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 2.8, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 756732707471360.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/zero_to_fp32.py 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-140/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..91d2d3bf32ae5572d5bd586463605f58bd3e8db4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9979000ec2c60e48e0bdc654f2f66835fa7435d77fc6d3617bea2d70defa8eab +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e059863c8f5bf5c057ec4c1645cc96d19443b4b1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a1409ed56589059bbafb5914b44fac6893dfa0e90377ea194f91c85caf5841 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcbae5a0f5b7c5b87d1f6bcee93d36c0d9921301 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb51625e43571920115ed4faab0ce2b6dc06a596e1b13e952d6d2495d26b508d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ae33066cc35648ea043a69b4605b5e92692d124 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15891fe39cf20ec1c83b53dcc4ff2a918ef88b979c5c7863fbfa0181e02c639 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79169425df4dce9950d2b1353220d4e576a608e1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5986951ec6af46259d94827f7ddcfc86210b3f7704c32f7874c9716415cf8d92 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5bb4d6328a19d916319e031854e7fd4379e354b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4cf560fcabb82b94f90c329ed58fa1576fc6e837247b7c1a2d3c64c801a968 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3af1a988c9a3bebd433baf56fd0f0b0e45754dc6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b99194437c11062c6a02936f0285d9ee4a91f4fef1ab3aad02618514fee417 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11019d43550ce6147b31680b4f6737aa9b39f0dd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0760a0d4792375764813e3943f5a24fe91abee0bda134cbfd6c1ea08ce02d253 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c030ded4ea7ab8237a2f2b7a02867f09b8efaf41 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0a4a3247a84e0d8eb058fce7dbeb7436f9a62cdcf684b791c84da4d1d4a95a43 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bb52e204936712e129cc4c08a639498a951b866 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac0a1ff002b7ea57be0111c20ae68ecceee98a9d236a4ac21e0c202f49b2d9 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2d07e780c1bb54bcf619b05c2df9890dd71884f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7e4c6182b348c4107c85a615ee8ab650f16438b09d43847db76396d4e06c17a +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..87ee5a07bcea5649bd44a7b7303cca9786048d1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5503c0901f4ed912ade9bcb7a3171e70a9cd00e24340eca823d4852a9fcd8cdc +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..078ae1035963b75da6d0a2a49210007be2f4341f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3305c3d11a96ed69f209f9daf4be77c2326880fb1f6f2cc97124c497244a45b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc62e5c3a4895c94b22bdb8eb8c2603b40f6d732 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de098f1b54f7348e1aad8356ceb37c1dd3f67f67c2f8ba01f449c578d60b7529 +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6a5dc0611317b1db4432319d2c4976e99b904f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f0ee0811acfabf3c6d8b85fb6d806ed703aef221ca46526291c246849aaad7 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a84a371b19317aa9dbf040ae16e79435b6ab252 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0927ab4c9f967ff57e6bc4646231e44857811342421721c657957d99b89b2874 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f4f88c6a36b34c4b013e016d8ce35099bdb76a23 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5ac3d3a9dba2878ff573133f68b2914dfda959f871c882380eb0c01ee09786 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/latest new file mode 100644 index 0000000000000000000000000000000000000000..3df30ded267d950ff3ca04cffb9660be12079ca6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/latest @@ -0,0 +1 @@ +global_step160 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e31a2394e12bf431ae13288c3d90fe4727f07fa7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6462d333dbc5bb5e497ea9b0adb960f7616f79e6eea63222de6d5bd559516 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1db0a0f44aa3ac1d82c3bf8dc2d8968eeba4ce7 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b045e1bfa728f51c8b51ab0faa20b128a4fbd350da006b9b39a19e24abdf5a74 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..75de18f57a056bd6a5f89df1abd045678f3f919e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a3d058d2628a61848c2441d313f251278bd8f74ce43dc44d8cd8ad3e619a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fd100693bc9f3267d044ce4a16e702502dc03ec --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f72fc498e6eaa671cdc0e8a627a668b8ef607063a22ddb4edbc05e791be830 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aeeabfe119f1cb0c8c804f1b9a4d3049f478d69 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12889af98e175b734a788f4c5b8c4da91dd61ff3a05aaf61b9d4c66aa3dd8ad6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fe0f42382ab06f4d26d753745a914c9e46100e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe21a86abfceeac2cf2f48afd61a9a506cf61a287f3403f1adf391bb2ffa5a83 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5830ca6bd04645962b6e56a00a91cd8349ca449c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73488bec91f9dee6d8105d06f99edaf4d27b6b064250d4c7023f33285b2f3132 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..343d1c0475f0dc64100dc67b09195e047f1a7bcf --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf6ee1cc2e1325b428a21172ec4e61b7220c5489751ea11c06bb66c77a0cd08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..229789af83e72e748f236450e9d2df977318d98a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b659f5e1f39ab526587d47a9d305eeca96cdb1335d25ff0a7b9958f685604b4 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1b799abf11121d559f282f3042748b515a46f08 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/trainer_state.json @@ -0,0 +1,763 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 3.2, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 874778826637312.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-160/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c0ac878e2fffc8a11fc259e8dddb5b0e1f8ac01 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8232adc9e6c48cec0dd5c5ee96774cd56ed6db506246ec60439c2d427061449f +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8676d4632215a14cf235125649b685ce4eb8cb0b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a45a1b46fe2b42ed71be7ebf1bd508a2e872d4159bc928bc65ace086483329 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ade257d5f810b6dbbacf08321720d01044033c3e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32648cca2afd0dbcb41ef837f9c124ffc8eabbad692daa3feb30450533e1ff92 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..970430607e9eafa1b990b488916b16727efcba9d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35746b616611005f300a8c1bb71bdf73f3d0bebf18d89d3b9c2227768e137bc2 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68a10ec25bed91c62138fe143746cb23de24d8f3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d316465016e0b87a17b406e2485e55b591a7c20fd1a38a950e7051eb2c3562 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cfe1bedc9231ce8db99cf4e7ddc2c4d8c4f2fc9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43592c2249c38a7e2303e56617a40cfca6ffb5458cb621ed8bdc674ee5847cb2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6c11d66e91555b81171e1004eec7cfc83d36abc1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a287bf316387c760c5cce97c4e318c0edab85a6969a2cd08d22050f6dd66421 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38fc06f00eaf5516bca68f40223abd709b1bce54 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b9a7302c09bf1f025c15774bbc770032326f1d81d8e3cfe3377ec667247081 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d64f5cc9ae9ee605bf016e1e7e34ac67a34f1289 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b653ce1e2e723c4687e383d55c7bc86c10cdde02aa881846688002bf9aefd341 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2932a8773c16b6882e1419d8469957df90af1997 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18181ab077dda4474164b6d499e3f674797f9b6b82cadcb04aedb31aab88d290 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d78a85a66ef239c8b3fb8a531c558de71e9c72b2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:270207ddc5f40f1c24ac22b40b5693f65c3476ded9e9e656f54628f211221bbf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..2f6c30dca8b331694f63d796f11ef8b636eb3216 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f6985acc97513eb6d5e31edb03929f8373ca142aa960eeba2518ec6deffaee4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4808a28899a3d1edf255d9b2f1d9b1807a045730 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1b45cdbb1e21d261f12f8c1ed8dd9f08ac2fa49f2344f97d1b28056e59bef7 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e13e050e5de104e653fd50cbe84dd81c07982fe --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c785f1969585fdf1c0a111687ec9da930fbaecbf76efb9d1a48e31e8feb8bdf +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..756f0b04cd2a8c39cc5fe5c5da49513d6b6ec9c8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70365c55ca295b1ad744259f1d077790d9a5e113116b01c8c2e36e8ff78964c8 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da9049faa15e3b40ff4863f9162b59729aeb0d97 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11403d6dd013ec425d5d5997d4819527449778b16ed1b551419e7374f0c3e50c +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f51cd9c27d4398c68db9cb5a4b59ddd0cb747112 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a0e692d2fb2d17a829f3b429299bf9bbf4d690eba4ec8854cf7b9cb26489e4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/latest new file mode 100644 index 0000000000000000000000000000000000000000..eac7d625396c2750025575c77b8da5d622b0c7dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/latest @@ -0,0 +1 @@ +global_step180 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f51b498d48145bd9cc14b35f8236b9ec95a4f7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08e59ac81067b262a084604cd3392250166c2841 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20a24c17b4be2ee59cd5e6682010519318a91e58 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..54050f6cf8fb847e2a926e14a7aad2647761521a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..263aae475c49b090bce43f143308192c5bf9a95b --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..942ed5d60ae87dce686b33da76a34db404036dc6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..57789be3df3983cb8acc1500bf6470ffadb1c578 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32d6e2e7eb7148713b473b0c821a98e616ab6e6 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18942cfbbbc36710e196a20b862a745c9dcc2468 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa6cf7ac608af8ab72180ce60dcfa61b0bf4eeab8e185f70f65a95b45e6b7a +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e29d21e78d1d9b4eadd1598599b722fbcf177ffd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/trainer_state.json @@ -0,0 +1,852 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 3.6, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.6366379388596981, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.6796875, + "logits/rejected": -0.21875, + "logps/chosen": -118.5, + "logps/rejected": -456.0, + "loss": 0.2604236602783203, + "memory(GiB)": 77.37, + "nll_loss": 0.10009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 19.375, + "rewards/rejected": -4.53125, + "step": 165, + "train_speed(iter/s)": 0.064349 + }, + { + "epoch": 3.4, + "grad_norm": 0.4478398792618674, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.22265625, + "logits/rejected": -0.62109375, + "logps/chosen": -458.0, + "logps/rejected": -169.0, + "loss": 0.22490353584289552, + "memory(GiB)": 77.37, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.25, + "rewards/margins": 25.5, + "rewards/rejected": 1.8203125, + "step": 170, + "train_speed(iter/s)": 
0.064149 + }, + { + "epoch": 3.5, + "grad_norm": 0.43618775126579723, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.5625, + "logits/rejected": 0.06494140625, + "logps/chosen": -254.0, + "logps/rejected": -452.0, + "loss": 0.24174847602844238, + "memory(GiB)": 77.37, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.625, + "rewards/margins": 20.625, + "rewards/rejected": 4.9375, + "step": 175, + "train_speed(iter/s)": 0.064217 + }, + { + "epoch": 3.6, + "grad_norm": 0.328809465093868, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.251953125, + "logits/rejected": -0.640625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.30023603439331054, + "memory(GiB)": 77.37, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 25.375, + "rewards/rejected": 0.8359375, + "step": 180, + "train_speed(iter/s)": 0.064542 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -284.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.90625, + "eval_runtime": 7.0704, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.141, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 984566522707968.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-180/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c83d64de1ba350d99a517262c145d37496f08fed --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322797fb885a660effa4091be306bc1ced32a9555fde1e27de6b7852ecb29659 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + 
"training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72fdfdad16482b33179a1eb884f298fb1dcad294 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf342587c03779c234939383c7f524311c07bc5f1082f5390bb0b27b35f9c3bf +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e2a686b8511ecd5eafa27e9fc457263661eb531 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c314699608b13111b750a8fe97b4c3110e8fa3e39561ad637778ccf55dc4ce3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff896cf449d0c226455e0da39b076769fcee06f0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22c3bb3d4af8dcd0bc14f4b34e5ef3ea7e611c976dd3065d1c47742c7751e6d9 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a4f1cbd57e424cba848447992abea9012de566d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb643f63c4138b8ce34ec6096861bd54350d9e715fe53f1433d6a8b98a007d0 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd0f6d080ba1a782e23f45db7de4818cffe2aedb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058cc9baea9f285491f190fce6d7237f894d056a025ee3ddaea1f1902df6aaa1 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..bbdc2c8c4a19be9fc0514e7e1e4e0a0a90ea63ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f90dce46590d8da56c2b97c22689ebe84c30ecb2f52aefd65b1dd8a9b1aac708 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb363cf0fc41175a83a3d034487c2b1f023f566c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c855880dd47f638705cb7f80f751269c32f73ed2c4d9fac7c9eb918bd38c60f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f23629c093a29c1b2f8d131ab74950d10f894af1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcfbf7e47f96adafce495deeaf3f7797cbe9e6e99c2666155fb6b20a9c32480 +size 
155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8eaf7ccf268dfb240bfd65263abf79405ba5bb2b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6dbf78c01cc387486e57c3c52abba1ad23ed5241f67411abbc3ab6b91a4584 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79c09e4417f6875a7f2cce5411840d131b2fb94c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daab2258b699a674b288b6764ed890eb58482a9167bb6b4c96d0d32e0ce12b28 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..098562d1db99b6738150c2fe4f67439574b4ef69 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac2ac815b4252f4e39fc34f5595c625aa8c39f27f95204cb1fa0a96a126538b +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1f8e0e6652e57269cce59ab041057ef92c68a1c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a2cac023e2b2b8375b4aa8d456b2c9bfbb90a11ba27250c5951f5b0e9170e8 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26ac12fdb1d0f4623900ededde1c22feb58f181 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db4f603e6de91f3092db4288f8470eb4a06cb0e7543ada42d10a8355fd3e65a +size 1107782 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddc87a4ae6e8e07cd3c9cdbed98d9d556a6345ac --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010816564ecc7ec7910e2be92d0891b318b6b92e650476d60320f079bc2e4f37 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48498d137683134c3f096f8910da02b41d84e14c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c8096a7800cd6276ab955027401cd0fb8e708ba4ba83d843702759c53d98a92 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d40b8e07ce49da2d392a140a748830f0a6d23eb --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8299a3869a7547d8b5bc3f3db39bb8df28f44685cfc0290eae514c12e006996 +size 1107782 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2729ff9a97436d6c8ad743637f529065140ad3f1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e1cf73eea4791075e839e628da180bf39e1e01fcc1630f4ac9c723d8793968 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc00e22021763c9e302737dcab5aa35e0b75cbda --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.49023438, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20", + "epoch": 0.4, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": 
-490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 111559537328128.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/zero_to_fp32.py 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..071d33d7ef2fe3005b1997f12c78262aa4f54a4e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4ad9c68ea3cdce30c6d6cc01b891ed0eac4e368e663ea188e58b9ba935fa2b +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0163f223be2c225e9d521f81b037d42507fde99 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecddedd288d71b7bb2ed713a9ad3aa099b5f56132f9e975916481fcbe1c159fc +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41e436fea9fb8cbeadf9be9f69e6fa38cb896464 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9830ac0932621ae7a2bfb682dc1e35ad352cf48e440cc67f193ed022de3b152 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e96caa79b2f812c01fe1fb96bfcab0cfcf819dae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:756612cb33b0db80eb45c23cd944546a38272047de113e21d6027e1b23bf39b7 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a6637a4833f5a6349519af5061cee069afa6e6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30813776235aef397eb3e5bfad9a53e7887674ae47a2fc9f100b4417b7aafa2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43dddcb5b3d1b79964eec38843b586b66408facc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a8ace2938357a232af14d4d961e3d72c319d39ecea26a4f6a22266e559eb45 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6fd61ccf74724569d7b7695eace6af123407b986 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2124c3faed4c97f74d2fb934a4ec215fe4383527a401da7f62dd33c9fdb232c7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32d753eb57db0fc94e46e4ee6def44bac8eafa88 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41f8afda33c980f08c3969f66bde7bcb8fbd05ec64e56a2d182fdadd2dd505f1 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..313cfed78a3cf91fc124673eb2aede57a5b78602 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:265ca96bb198dd9606968fa6a0fd904ae7b9733c90c2de231d5c2c8ca8e1e7eb +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9386fa2a8228b2e5f2e4395e10e2712eeb7dcdb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede5c98e820d7441d5ea4dc7e02076da63a510c8165dd959015e40448b83505e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0b3f9c11b7795d3bb38bd8b60762a5029a231dd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce39ac0edfd96a8e0cb4bed9cad9832ed0b916bdd0ec96eb162e04b948c98b8a +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..d6348897346ea645c55ee5e7f01f60343ee1e492 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58808634b5798ff1df1b6e3d83a8f80e7e95f65c061b6290a2240374354178b1 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b931de2c3b78efb6a65d776fcd09849be9c1e333 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1edd4597ea0bf90f18b4f2b33792ec5bfd7bd196d316203dd88333b6fad63e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fd64dcc861c64b094e2a78e4d556a3c8da2f0ab --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efaf9ed02434e43945d9f7f55744d5d303e81c9aaf7d54e3fed025d668fee80 +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..133c49fd0718a399dce3ea2825985a5238963e3f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acef0c26c4467202cc5e26ed2b86fb1aee58884addcca6a35145d04b1514139b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..215a238fa5e89d31c06bed08c3b76b5be797c446 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f382e120462df0af9d45c97312af7f6df26e2b4c9c44a32ec78707f2ff9d2e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..70716ec87cf9810cfa1dedbd709a42e3eb31fb60 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683e06a81dc0cb8a7b54f36fb551c94aac8fe90fbe58946edc7288f10650f1ee +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/latest new file mode 100644 index 0000000000000000000000000000000000000000..753e24e10f3a2489150f458205cf759fd8b6081f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..584f4a4a43f100f35696d7314a633631af587f25 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891ffa7c7dae99113aa986d67278b52b8c57db55001dc3547a61f24569a34ee +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..05b027a867e5e9cebd446293ecff82cfb240cc76 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b92875cb04deec367605433847d1bda444b178b643d2da7ed9aaf738d232b4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..af98f0dfe2a5d89fbccf90df58246a0b078c7016 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f5f3338a05e325b5408a1cd0b6f5e5b10fad05fe479d63f44bec4cf18107d6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..715aa4a4ee3915f810fc2bacb2153eb8a0913781 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be749fea477a3867d44010631937e0d8f071ca5f9614f9795c92c7fa68833a6 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bde70899833455b6ee4a99aff9388abc5ffe92 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc4a5ea4532c621f4c8e9891117b2e597a7f005001e8b4f2a1b4da8c82bf964 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..90cdeaa2fe438098e9d95ddbc06c765e51af1e78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480f9fe7dd71b54d915b46162e34b780ba2467d5542115cc809dbca60b394c0e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd30529614c5be239cd9477af6bef0e313740b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11d982dcd813e82c2d97a5491ce9624cff2dd22e8655ea617ccef1fc1474470 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bed311094effd49cc2c89237c675f56eade157d1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73494fac3a001cba7cedd097b97f028d4c1d136ee6709214b0a7fe305e5b9089 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b08896e3e64039017a0606b43a6327f1f78848dc --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826281cb7f404c3805b9798147d05074dd208eac748e2052087055a015aaeaed +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a487d8c97f227277c9ce3c0aef8ffa0780a8f5aa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/trainer_state.json @@ -0,0 +1,941 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 4.0, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.6366379388596981, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.6796875, + "logits/rejected": -0.21875, + "logps/chosen": -118.5, + "logps/rejected": -456.0, + "loss": 0.2604236602783203, + "memory(GiB)": 77.37, + "nll_loss": 0.10009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 19.375, + "rewards/rejected": -4.53125, + "step": 165, + "train_speed(iter/s)": 0.064349 + }, + { + "epoch": 3.4, + "grad_norm": 0.4478398792618674, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.22265625, + "logits/rejected": -0.62109375, + "logps/chosen": -458.0, + "logps/rejected": -169.0, + "loss": 0.22490353584289552, + "memory(GiB)": 77.37, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.25, + "rewards/margins": 25.5, + "rewards/rejected": 1.8203125, + "step": 170, + "train_speed(iter/s)": 
0.064149 + }, + { + "epoch": 3.5, + "grad_norm": 0.43618775126579723, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.5625, + "logits/rejected": 0.06494140625, + "logps/chosen": -254.0, + "logps/rejected": -452.0, + "loss": 0.24174847602844238, + "memory(GiB)": 77.37, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.625, + "rewards/margins": 20.625, + "rewards/rejected": 4.9375, + "step": 175, + "train_speed(iter/s)": 0.064217 + }, + { + "epoch": 3.6, + "grad_norm": 0.328809465093868, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.251953125, + "logits/rejected": -0.640625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.30023603439331054, + "memory(GiB)": 77.37, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 25.375, + "rewards/rejected": 0.8359375, + "step": 180, + "train_speed(iter/s)": 0.064542 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -284.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.90625, + "eval_runtime": 7.0704, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.141, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.49681405036162596, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -0.09375, + "logits/rejected": -0.45703125, + "logps/chosen": -358.0, + "logps/rejected": -195.0, + "loss": 0.25345821380615235, + "memory(GiB)": 77.37, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 22.375, + "rewards/rejected": 2.671875, + "step": 185, + "train_speed(iter/s)": 0.064516 + }, + { + "epoch": 3.8, + "grad_norm": 0.5063889084098974, + "learning_rate": 1.4997896271528739e-05, + 
"logits/chosen": -0.60546875, + "logits/rejected": -0.2734375, + "logps/chosen": -183.0, + "logps/rejected": -544.0, + "loss": 0.2541311264038086, + "memory(GiB)": 77.37, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.875, + "rewards/margins": 25.25, + "rewards/rejected": -0.404296875, + "step": 190, + "train_speed(iter/s)": 0.064791 + }, + { + "epoch": 3.9, + "grad_norm": 0.5269282280846967, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.435546875, + "logits/rejected": -0.10546875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.21763362884521484, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 22.5, + "rewards/rejected": 0.051513671875, + "step": 195, + "train_speed(iter/s)": 0.06463 + }, + { + "epoch": 4.0, + "grad_norm": 0.30158893716839064, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.0654296875, + "logits/rejected": -0.056884765625, + "logps/chosen": -235.0, + "logps/rejected": -568.0, + "loss": 0.24068713188171387, + "memory(GiB)": 77.37, + "nll_loss": 0.25390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 21.625, + "rewards/rejected": 0.296875, + "step": 200, + "train_speed(iter/s)": 0.064458 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.19921875, + "eval_logps/chosen": -7.90625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.376953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0218, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + 
"should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1092251805024256.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3e07b20e0b60793df4262dacea3e104a0d3c72d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8dcbb032f883d4648037cc552bf487f27d9403a353badc0af5cfe71e28ebbc7 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..598b6c7ada74091b1d73316800fc68a17278128a --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f804fd12866d8f54c3299df8a42eb7093e5f9e6348f747d84f66acfb4058e7f7 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26580adb24b88fd0ba77bd96a4f14ca6ba8fd9a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a96bbce8e515472e4339c93aac8d4d281d8cdca376954f3cbed8ed6b5e3e7f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00e4d794fa7e9a67a16d13742ccdd20f347db277 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da40ce5c45f1585f091d1690b347966ca0395f03bfd3a5ba9fe08a5943ed9be3 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4abb638837d929bc2a54b5001fa220dcb6f07b2a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36aa3443601bf2c3688e54f9426f408d1b88e7d33182c701d0223ebe73a54bd8 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4421d9676f3fcf4ab9170515128d2783a57361d6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c53022e07bc2fcb2459ce1e0445d704effa3336d45d5f39494b0a5ba2808f6e0 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..7c7a9f2a043855691e2cc0f00e87efb20a78b833 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e19c74f7c725efb1b632cb0e074013927a3408b576c2c0e7cf21ef9ea01ee3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ad87ca1c1175d2753453a871d2fa35ee37b4049 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f5fa1863f3ddc52f91f147597e66894a0b99931265fb45a9134a35fc16356 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5971119e19044172c752eb5259d9ea1f81a3b2d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6e5e789a7db143c23a3c6f9d2cca1b0a74b9ac681c42c8f8d564420c10d79362 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4001549dead1ad26881eed12344ad9f1b53afd92 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd8000652c140749c5d6a4228b4d83ab9fe446e45de13a198d258708b2b462d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d5899a85b9dfc8985f1f1ed3f2cbe702e73199 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffc2892dce57c0f0d33351b690a26abda9a93340adf054389475f1509d26df27 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..cbb56aeee3902fa6f032d487c5fd80fa68cdde29 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3096335b50ddd64e5cce779bb5a97f0338ebdd21b257d2af39d5b05c2e2146aa +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..299b48d0d16e2cb23a339c9c893115cdd75aba04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58ffc0b6942fff690685bf2ff03a20bfdcbc28a4a9dbeeafa0f468b8f82ae1d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ed8d5a05a049d1414a980e87cfd299824d0d2ea --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c05b153dcc2f38676593b081f8aee78880062670223507c85bbd0652f9a9e7b +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe4d103dc0e50de5aa55879cb48986750b27b95 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987227f69464208f7f08217faaa9f524add2569c17dedb0375525e984ae0288b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0291f13e5a367e81a2c11056064b812a52d28295 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc96c452dafce2d8540f8199af19d91f057b9fa0abe7db28f7cea2b571dd391 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4a197e45872e38a31da2ef0859424e206f8ee65c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4b35799b615e524accf610c7d04133c12a4409d1be01c05ec7a60d96d9a18d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9ebe2709e7f014a6431e10a08b9ee83756b9b83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/latest @@ -0,0 +1 @@ +global_step220 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc26f1e85f4e8e85881b70bb37705b907a71e2da --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192b6eaac6b92a2de7d039b2fc8b1f373bff6953e1e6a952189b56167078edd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..03230bbf28b3fb985e4af72af45587ec8ada4e7e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/trainer_state.json @@ -0,0 +1,1030 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 4.4, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.6366379388596981, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.6796875, + "logits/rejected": -0.21875, + "logps/chosen": -118.5, + "logps/rejected": -456.0, + "loss": 0.2604236602783203, + "memory(GiB)": 77.37, + "nll_loss": 0.10009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 19.375, + "rewards/rejected": -4.53125, + "step": 165, + "train_speed(iter/s)": 0.064349 + }, + { + "epoch": 3.4, + "grad_norm": 0.4478398792618674, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.22265625, + "logits/rejected": -0.62109375, + "logps/chosen": -458.0, + "logps/rejected": -169.0, + "loss": 0.22490353584289552, + "memory(GiB)": 77.37, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.25, + "rewards/margins": 25.5, + "rewards/rejected": 1.8203125, + "step": 170, + "train_speed(iter/s)": 
0.064149 + }, + { + "epoch": 3.5, + "grad_norm": 0.43618775126579723, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.5625, + "logits/rejected": 0.06494140625, + "logps/chosen": -254.0, + "logps/rejected": -452.0, + "loss": 0.24174847602844238, + "memory(GiB)": 77.37, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.625, + "rewards/margins": 20.625, + "rewards/rejected": 4.9375, + "step": 175, + "train_speed(iter/s)": 0.064217 + }, + { + "epoch": 3.6, + "grad_norm": 0.328809465093868, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.251953125, + "logits/rejected": -0.640625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.30023603439331054, + "memory(GiB)": 77.37, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 25.375, + "rewards/rejected": 0.8359375, + "step": 180, + "train_speed(iter/s)": 0.064542 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -284.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.90625, + "eval_runtime": 7.0704, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.141, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.49681405036162596, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -0.09375, + "logits/rejected": -0.45703125, + "logps/chosen": -358.0, + "logps/rejected": -195.0, + "loss": 0.25345821380615235, + "memory(GiB)": 77.37, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 22.375, + "rewards/rejected": 2.671875, + "step": 185, + "train_speed(iter/s)": 0.064516 + }, + { + "epoch": 3.8, + "grad_norm": 0.5063889084098974, + "learning_rate": 1.4997896271528739e-05, + 
"logits/chosen": -0.60546875, + "logits/rejected": -0.2734375, + "logps/chosen": -183.0, + "logps/rejected": -544.0, + "loss": 0.2541311264038086, + "memory(GiB)": 77.37, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.875, + "rewards/margins": 25.25, + "rewards/rejected": -0.404296875, + "step": 190, + "train_speed(iter/s)": 0.064791 + }, + { + "epoch": 3.9, + "grad_norm": 0.5269282280846967, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.435546875, + "logits/rejected": -0.10546875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.21763362884521484, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 22.5, + "rewards/rejected": 0.051513671875, + "step": 195, + "train_speed(iter/s)": 0.06463 + }, + { + "epoch": 4.0, + "grad_norm": 0.30158893716839064, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.0654296875, + "logits/rejected": -0.056884765625, + "logps/chosen": -235.0, + "logps/rejected": -568.0, + "loss": 0.24068713188171387, + "memory(GiB)": 77.37, + "nll_loss": 0.25390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 21.625, + "rewards/rejected": 0.296875, + "step": 200, + "train_speed(iter/s)": 0.064458 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.19921875, + "eval_logps/chosen": -7.90625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.376953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0218, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.5197111532124594, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.17578125, + "logits/rejected": -0.1318359375, + "logps/chosen": -270.0, + "logps/rejected": 
-556.0, + "loss": 0.2583838939666748, + "memory(GiB)": 77.37, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.125, + "rewards/margins": 19.125, + "rewards/rejected": 4.0625, + "step": 205, + "train_speed(iter/s)": 0.064077 + }, + { + "epoch": 4.2, + "grad_norm": 0.6236218136792746, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.2294921875, + "logits/rejected": -0.48828125, + "logps/chosen": -184.0, + "logps/rejected": -220.0, + "loss": 0.2005645751953125, + "memory(GiB)": 77.37, + "nll_loss": 0.208984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.75, + "rewards/margins": 24.0, + "rewards/rejected": 1.8515625, + "step": 210, + "train_speed(iter/s)": 0.064191 + }, + { + "epoch": 4.3, + "grad_norm": 0.47824031212563534, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.494140625, + "logits/rejected": 0.1650390625, + "logps/chosen": -117.0, + "logps/rejected": -972.0, + "loss": 0.23559434413909913, + "memory(GiB)": 77.37, + "nll_loss": 0.1142578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 21.625, + "rewards/rejected": -2.890625, + "step": 215, + "train_speed(iter/s)": 0.064213 + }, + { + "epoch": 4.4, + "grad_norm": 0.3885497874914557, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.5625, + "logits/rejected": -0.08203125, + "logps/chosen": -70.5, + "logps/rejected": -864.0, + "loss": 0.21706581115722656, + "memory(GiB)": 77.37, + "nll_loss": 0.052978515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 19.625, + "rewards/rejected": -3.921875, + "step": 220, + "train_speed(iter/s)": 0.064109 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.20703125, + "eval_logps/chosen": -8.25, + "eval_logps/rejected": -286.0, + "eval_loss": 0.44921875, + "eval_nll_loss": 0.392578125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.625, + 
"eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0379, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 220 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1206453557264384.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-220/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aa5f3dc607f8202243e1843c8c12373b5b510e28 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4f1c8288ade5f10106e0271a01ea584e41c18410c327fdc9f8bbee5303a783 +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b8e2b432d2e8b01b2d5c1ffe982dc3494f5c4ef --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb39892e5a1dee5386cd754d03829ae472978b6bdaa9de92e033ab1b796b678 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45f8bed4c0c29e54f02e68fa63f5be74d27a154a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14349651371deab47ae5f3a23010593b21f34c2bbca9f12cb6d165bc0239bf6d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..035822770efd4a67649f2afbb73ee820bcb8a6e5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d816076c466f948014f95fe87412bd268b6f009488fb108cdf61d7e4b1021ef2 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb126d62ec431f64de9ebfbd5f20527cd9186ed8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3327a51cde564088d3304d8863503023fe44ceefa43fa7598dc3b874b45997d0 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..269d2d0b31a5c65522834500df8a1b54231d6908 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62a63861dcbd5e714c43df4fcae0a95073ce4b83d65cd985e9fa71e282db3ac +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8e4b92853e2c507953f95a74d345089334026a37 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf3907e3293c039d736f8323fffe8a0f60de7b8b449241af8e8b79f9e4963db +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8331ff5c259f5739ff9f8fe54b987f0c4da26d20 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8b79e6d3aaad4dee9250da358fb4497de3d8ef5d899edc7e3e4b525cf8f465 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b7ae61fbb4da3e0a0a9b6dcb206b32411d66d18 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:79f89ad9bf5d4b349e88cccbf04ed3dac8001fed6b4c073af0b7f0640f217716 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c5c4048eb1d529dcbe9e99a6121a38a8a507d8a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f4291b39ea877bccddef328c62959bc2c2d769cb7381a9619d92618bc8e859 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7eccdf3e4244f8b654805d86ab9502dc4f5eec7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fbfbc2df12cd68b4a45ee00b2f35378e753888cff8d2fee66de1b21c09fe2d5 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..240734631e319f63014c88fc4e9de37a330ea7a1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34431ad9bf9fb39759581f279df075ede3570a4802b8cc487b1b5c04f0a99043 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afc557da3be8f3f6fe56dba58de417f9950953b8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8db6b64587b82e05310f6dea6f26f7c15db23a59e9496299490d039ca34322 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01db5ad49a094ffeaf5918b3e5fe72b658f67eeb --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5634fb653619f31b3d2e26f7ca93bc49241f9b84514606f7f3049ee29c2c0719 +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5d32ba8c6b706369cfa684ef45fdad23dfd878a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4761092fb1d9eb6cef1b0067e673037ad51671b4f1c043af77c6d001732903f6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee7ff4a2fb36ae046b8eec0df9c4777f67a4b672 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f334d97e640c681248521d4af909c1d4a8fa8e9c7852f6a8eb53efebc21a623e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..969cee9b67c8455d4060c64d7775f5bf32de11d2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9374191d6ab47cedb4bb518431d0d2d1848b693b4bba79f4a0da99f90bd3124 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/latest new file mode 100644 index 0000000000000000000000000000000000000000..161e63cf7292b2184098d115f0621d2ed09e86c5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/latest @@ -0,0 +1 @@ +global_step240 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3a6ea45dd4e59b9683f66476f460fa0c77a9d66 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0c9979566a5d89cb3c766336548670ec6f2291deba1b7ab1764c12d3187b24 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42e6b0d6985c9b3f0cec701759e0b3d671c77abd --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e36a570d6158fc25d1cf5d9f8f450fc64c5a7683330277f89ff76d5f2fc6cd +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..376994a32199299a2a48b62753947cdb1f7ad72a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f619cbef4b74f1680d667c8788285a602392e63bdf3760ef3a59ec8864d483 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f1edb2dfec55e5cbead7ae3d14351c3650c4f77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc037fba93ace1bf7ce01b1a5f7d785698d47b4cc2cedf2300bbf7a41ebf05c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..016d34db4ec6597c207021d026234c9692c3f3ad --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab728c2461d6d1c64f04d7cbfdfcbfa7bd7ad0ef6e19d52458501ee81b27128 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d7824c2bd9e8b1cec7f0d84d673017b0da62e43 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27530e653ebf5997ae3159cdcde264607e6a6f86b7e3c7a1b3a1e8301cd43d03 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f41ee261ad98d2d0eb8f09167a5b32604513b56 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fddaeb1257697bd7c0101abf1ab23f2925d0d9165cd8bddfbd22f8444db2b7 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8816834cc1c0e822e11a8df138fa41557f3a0fb --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942af3734a320fe12a3205a47ca1cdc7d1f0996bfde86c020a35545ccd2fd418 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce5faf9896aeadd65d47acddb4b510a6fc3c65f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a46b33bfe1e26ebea81904070b93f8e7376ae49add370042b1998521eed8ba +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..05c4d5ebd8492b725347c765e8456cc561779cff --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/trainer_state.json @@ -0,0 +1,1119 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 4.8, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.6366379388596981, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.6796875, + "logits/rejected": -0.21875, + "logps/chosen": -118.5, + "logps/rejected": -456.0, + "loss": 0.2604236602783203, + "memory(GiB)": 77.37, + "nll_loss": 0.10009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 19.375, + "rewards/rejected": -4.53125, + "step": 165, + "train_speed(iter/s)": 0.064349 + }, + { + "epoch": 3.4, + "grad_norm": 0.4478398792618674, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.22265625, + "logits/rejected": -0.62109375, + "logps/chosen": -458.0, + "logps/rejected": -169.0, + "loss": 0.22490353584289552, + "memory(GiB)": 77.37, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.25, + "rewards/margins": 25.5, + "rewards/rejected": 1.8203125, + "step": 170, + "train_speed(iter/s)": 
0.064149 + }, + { + "epoch": 3.5, + "grad_norm": 0.43618775126579723, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.5625, + "logits/rejected": 0.06494140625, + "logps/chosen": -254.0, + "logps/rejected": -452.0, + "loss": 0.24174847602844238, + "memory(GiB)": 77.37, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.625, + "rewards/margins": 20.625, + "rewards/rejected": 4.9375, + "step": 175, + "train_speed(iter/s)": 0.064217 + }, + { + "epoch": 3.6, + "grad_norm": 0.328809465093868, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.251953125, + "logits/rejected": -0.640625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.30023603439331054, + "memory(GiB)": 77.37, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 25.375, + "rewards/rejected": 0.8359375, + "step": 180, + "train_speed(iter/s)": 0.064542 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -284.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.90625, + "eval_runtime": 7.0704, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.141, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.49681405036162596, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -0.09375, + "logits/rejected": -0.45703125, + "logps/chosen": -358.0, + "logps/rejected": -195.0, + "loss": 0.25345821380615235, + "memory(GiB)": 77.37, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 22.375, + "rewards/rejected": 2.671875, + "step": 185, + "train_speed(iter/s)": 0.064516 + }, + { + "epoch": 3.8, + "grad_norm": 0.5063889084098974, + "learning_rate": 1.4997896271528739e-05, + 
"logits/chosen": -0.60546875, + "logits/rejected": -0.2734375, + "logps/chosen": -183.0, + "logps/rejected": -544.0, + "loss": 0.2541311264038086, + "memory(GiB)": 77.37, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.875, + "rewards/margins": 25.25, + "rewards/rejected": -0.404296875, + "step": 190, + "train_speed(iter/s)": 0.064791 + }, + { + "epoch": 3.9, + "grad_norm": 0.5269282280846967, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.435546875, + "logits/rejected": -0.10546875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.21763362884521484, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 22.5, + "rewards/rejected": 0.051513671875, + "step": 195, + "train_speed(iter/s)": 0.06463 + }, + { + "epoch": 4.0, + "grad_norm": 0.30158893716839064, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.0654296875, + "logits/rejected": -0.056884765625, + "logps/chosen": -235.0, + "logps/rejected": -568.0, + "loss": 0.24068713188171387, + "memory(GiB)": 77.37, + "nll_loss": 0.25390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 21.625, + "rewards/rejected": 0.296875, + "step": 200, + "train_speed(iter/s)": 0.064458 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.19921875, + "eval_logps/chosen": -7.90625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.376953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0218, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.5197111532124594, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.17578125, + "logits/rejected": -0.1318359375, + "logps/chosen": -270.0, + "logps/rejected": 
-556.0, + "loss": 0.2583838939666748, + "memory(GiB)": 77.37, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.125, + "rewards/margins": 19.125, + "rewards/rejected": 4.0625, + "step": 205, + "train_speed(iter/s)": 0.064077 + }, + { + "epoch": 4.2, + "grad_norm": 0.6236218136792746, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.2294921875, + "logits/rejected": -0.48828125, + "logps/chosen": -184.0, + "logps/rejected": -220.0, + "loss": 0.2005645751953125, + "memory(GiB)": 77.37, + "nll_loss": 0.208984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.75, + "rewards/margins": 24.0, + "rewards/rejected": 1.8515625, + "step": 210, + "train_speed(iter/s)": 0.064191 + }, + { + "epoch": 4.3, + "grad_norm": 0.47824031212563534, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.494140625, + "logits/rejected": 0.1650390625, + "logps/chosen": -117.0, + "logps/rejected": -972.0, + "loss": 0.23559434413909913, + "memory(GiB)": 77.37, + "nll_loss": 0.1142578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 21.625, + "rewards/rejected": -2.890625, + "step": 215, + "train_speed(iter/s)": 0.064213 + }, + { + "epoch": 4.4, + "grad_norm": 0.3885497874914557, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.5625, + "logits/rejected": -0.08203125, + "logps/chosen": -70.5, + "logps/rejected": -864.0, + "loss": 0.21706581115722656, + "memory(GiB)": 77.37, + "nll_loss": 0.052978515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 19.625, + "rewards/rejected": -3.921875, + "step": 220, + "train_speed(iter/s)": 0.064109 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.20703125, + "eval_logps/chosen": -8.25, + "eval_logps/rejected": -286.0, + "eval_loss": 0.44921875, + "eval_nll_loss": 0.392578125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.625, + 
"eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0379, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.38319446798926815, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -0.12890625, + "logits/rejected": -0.625, + "logps/chosen": -470.0, + "logps/rejected": -201.0, + "loss": 0.23574182987213135, + "memory(GiB)": 77.37, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 32.25, + "rewards/margins": 27.125, + "rewards/rejected": 5.15625, + "step": 225, + "train_speed(iter/s)": 0.063935 + }, + { + "epoch": 4.6, + "grad_norm": 0.4331863840312884, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.028076171875, + "logits/rejected": -0.5234375, + "logps/chosen": -208.0, + "logps/rejected": -676.0, + "loss": 0.2071385383605957, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 23.125, + "rewards/rejected": 1.828125, + "step": 230, + "train_speed(iter/s)": 0.063909 + }, + { + "epoch": 4.7, + "grad_norm": 0.7273350214373521, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -0.58984375, + "logits/rejected": -0.078125, + "logps/chosen": -151.0, + "logps/rejected": -624.0, + "loss": 0.1720048427581787, + "memory(GiB)": 77.37, + "nll_loss": 0.1806640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 21.375, + "rewards/rejected": -2.609375, + "step": 235, + "train_speed(iter/s)": 0.06399 + }, + { + "epoch": 4.8, + "grad_norm": 0.32144115051135386, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -0.384765625, + "logits/rejected": -0.5234375, + "logps/chosen": -304.0, + "logps/rejected": -556.0, + "loss": 0.20636966228485107, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 28.75, + "rewards/margins": 24.375, + "rewards/rejected": 4.34375, + "step": 240, + 
"train_speed(iter/s)": 0.064197 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -0.98046875, + "eval_logits/rejected": -0.20703125, + "eval_logps/chosen": -8.4375, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4521484375, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.4375, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0748, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1312899030581248.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 
--- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-240/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3c444b6aa39a1b153e1145fb546131f907a6169 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3189b6c7c215fbc84ab6449a4b0c337d14703e891e40afd0a0ea809833c4e6b +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + 
"gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + 
"use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": 
"deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d1e8029a9760fff05618f95f3b59164df30d161 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555e96ba2eb9c50199aca1422fd6f5a01f75e8ca8851be771ac39f8568587e81 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fbffc03492601e58e06bf94508d3910c853076f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ec61cb794a04925c9b40598bd84f71c4d53a1b2186b0780d50dbe9a30637a9 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1299c4f3d725d25b5ddf12771daf76956dbb9327 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d42c65026fce3b056a2b1d05f1e64f6b7863ea4e3cdf82ef6d3b5c16c5438c40 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bca3f37459714ec0e8bdd63ca2ea85c96dfa81bf --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b246095cc347df86d07d31707c384830881b09bcfbc292ebe553389ada8d19 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53659235bcdf653db1b086bd4f13608998d4452d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4155af3640b110637f3f7cd99ea71d0c7fc23bca18e9bc6b2b335f59ad554a2 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..fc302df9afa7e12c3de8ef9fd00193416f433ac6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d622c2257c972820d526d4d719dc3c2bf12121c58e02bb697843574ef41560b +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6da360d517057a2a10fc34d60eded6a3bed6ded --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94a09cf6373baa7bdab468d08b73469aa5fa0a3dbb56a618b5762658eb04e9f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..497af7db5fed166ddd779054a8861fe4c169c97e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e6850ed8af0ad876fdaa6f79b3a39d60e5914ee6fbb8e08ef17544742c4b96fa +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6694a24fdc3c4cf0ded2aec93e744dd0bd838be0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01daf2aa78d1508af6af8b682498b59f1acea14b32e777d57501e304d8c4f9da +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d6615675ca9f69d9f07495b8090cc443518c547 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cc6a70df9ae3d1ee7bc929c4c0bbfe4e3731428092e85327b35e30a5ae8237 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..5086dae836696305af01de0b638502e70d220332 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc49f0e17fcdddd2eaee705127fa9e712f220206c8f73007ca85e65f1e725fdc +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b10c341c534752c8a43f711ad35e77d10177cf19 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572fd43ac65ad0fa936ca11d5ee944d01cf4b35dcc4992af46eca8847cabe07b +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..261d3b91781ccf654958e654abcc4927a41e6f82 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4948cd49ac00314d113278574704e1b5cc94f34b1b2b1178be59a9efa13970 +size 1107846 
diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59c947868950aaa46e2a3119b5dd046eaee51262 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a985abf4386f9d35749282fd267d92c1ea26b4cbfd18b7872354dddefd8a8a57 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..016569b53b1ce7e2ff8a1ac91592a572a998a0b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be01680f5078d9975ab330bee60c14207fbd678d6bb4802404169479ee609ccf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..1885afb3725fc0170ede2ef1d86f2e4dc7872a50 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c89989676332620d98f9299db3284bd7484bda9219fe58d938d654e8f786bbf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/latest new file mode 100644 index 0000000000000000000000000000000000000000..87449ff1a854ba4a77ea33fbc24adaed3311d6b1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/latest @@ -0,0 +1 @@ +global_step250 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab29abc7c5c196288fd5c119c67c4f655f27d44c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5c4738c31c5c9a38e1f586256d59a0e8e7d02641b9b9af2afdbe078440aeb4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8e0ba47a098b34da66857368b41c80a5d9d796f --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d374b3390eb52ec7f6161c06272d4f26cb715692bdf2ad5374287b6de420ca3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7676e48e7dd332be5f46585fc5f824c5791f76ae --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24111edc5a6a2994166cd410155ee3c630816d0fe21c13808ebd2a2ae45bc9d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..228202ae722c05ed5fafc13eeac33a8a2685cca5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157b21eda1c7f898e519251deed08049767ffba123797289de56343a92ba7380 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a63de21fa3e29782ced5828f8f34fba46bad33 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb615552e5845759bc13aa2ae50c0525fbf941fa76ee2e2c20cb9838fe1995 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d487727115f1120e55e91ad9583fb23ff8e34083 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf720fc22147ce563d6f2c2f6f3d916a7e8b7af174b480d072b5c822e992aa +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90628d8fd79ee2a98fb904251b6d7938f5120b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d055d3b033dc8e6fc2a19aa95162960544ab94a903988874315efe4ed5aa8e13 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e1556a7ec04e7309f4c9130351c880ef6a0626 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e03c685f2e019350bfdd41f006495a18690aacbccd7ffc1f40de827f433eb87 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..871b4a6cbd60ea4b2ef2416f3a46bbe632ddb667 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b1af2ae92a304371e36f6c1b7001f5dafc395be0b17c480957fc7fb58d8cd +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ac36b643e84f37eaab2f2aeb7439efa05c23354b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/trainer_state.json @@ -0,0 +1,1172 @@ +{ + "best_metric": 0.40161133, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + 
"logps/chosen": -490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, 
+ "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.606815621196192, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": -0.365234375, + "logits/rejected": -0.314453125, + "logps/chosen": -217.0, + "logps/rejected": -644.0, + "loss": 0.52896728515625, + "memory(GiB)": 77.37, + "nll_loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.5625, + 
"rewards/margins": 14.125, + "rewards/rejected": 0.48828125, + "step": 85, + "train_speed(iter/s)": 0.065427 + }, + { + "epoch": 1.8, + "grad_norm": 0.603209904087939, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": -0.2021484375, + "logits/rejected": -0.41796875, + "logps/chosen": -236.0, + "logps/rejected": -446.0, + "loss": 0.4052978515625, + "memory(GiB)": 77.37, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.9375, + "rewards/margins": 14.1875, + "rewards/rejected": 0.76953125, + "step": 90, + "train_speed(iter/s)": 0.065028 + }, + { + "epoch": 1.9, + "grad_norm": 0.39872701490711776, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": -0.265625, + "logits/rejected": -0.09130859375, + "logps/chosen": -286.0, + "logps/rejected": -472.0, + "loss": 0.420556640625, + "memory(GiB)": 77.37, + "nll_loss": 0.376953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.453125, + "step": 95, + "train_speed(iter/s)": 0.064964 + }, + { + "epoch": 2.0, + "grad_norm": 0.4636763750262995, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": -0.376953125, + "logits/rejected": -0.2099609375, + "logps/chosen": -214.0, + "logps/rejected": -324.0, + "loss": 0.3860954284667969, + "memory(GiB)": 77.37, + "nll_loss": 0.3515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.4375, + "rewards/rejected": 1.0078125, + "step": 100, + "train_speed(iter/s)": 0.064753 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.0078125, + "eval_logits/rejected": -0.189453125, + "eval_logps/chosen": -5.125, + "eval_logps/rejected": -268.0, + "eval_loss": 0.412109375, + "eval_nll_loss": 0.244140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.8125, + "eval_rewards/margins": 13.125, + "eval_rewards/rejected": -3.296875, + "eval_runtime": 6.9494, + "eval_samples_per_second": 0.576, + "eval_steps_per_second": 0.144, + "step": 100 + }, + 
{ + "epoch": 2.1, + "grad_norm": 0.3992315577108378, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": -0.36328125, + "logits/rejected": -0.62890625, + "logps/chosen": -336.0, + "logps/rejected": -544.0, + "loss": 0.370050048828125, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 16.5, + "rewards/rejected": 4.34375, + "step": 105, + "train_speed(iter/s)": 0.064078 + }, + { + "epoch": 2.2, + "grad_norm": 0.5929485058503616, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": -0.2275390625, + "logits/rejected": 0.003997802734375, + "logps/chosen": -304.0, + "logps/rejected": -864.0, + "loss": 0.3550140380859375, + "memory(GiB)": 77.37, + "nll_loss": 0.328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.375, + "rewards/margins": 15.8125, + "rewards/rejected": 1.5234375, + "step": 110, + "train_speed(iter/s)": 0.064054 + }, + { + "epoch": 2.3, + "grad_norm": 0.3698356285503532, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": -0.142578125, + "logits/rejected": -0.2099609375, + "logps/chosen": -356.0, + "logps/rejected": -584.0, + "loss": 0.38344573974609375, + "memory(GiB)": 77.37, + "nll_loss": 0.36328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.25, + "rewards/margins": 15.0, + "rewards/rejected": 4.25, + "step": 115, + "train_speed(iter/s)": 0.064216 + }, + { + "epoch": 2.4, + "grad_norm": 0.5467922855501066, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": -0.427734375, + "logits/rejected": -0.470703125, + "logps/chosen": -282.0, + "logps/rejected": -300.0, + "loss": 0.31541900634765624, + "memory(GiB)": 77.37, + "nll_loss": 0.271484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.5, + "rewards/margins": 16.125, + "rewards/rejected": 1.3515625, + "step": 120, + "train_speed(iter/s)": 0.064713 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -1.015625, + "eval_logits/rejected": -0.1611328125, + "eval_logps/chosen": 
-4.4375, + "eval_logps/rejected": -274.0, + "eval_loss": 0.401611328125, + "eval_nll_loss": 0.2119140625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.875, + "eval_rewards/margins": 13.75, + "eval_rewards/rejected": -3.90625, + "eval_runtime": 7.0901, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.8829625223112105, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -0.96875, + "logits/rejected": -0.287109375, + "logps/chosen": -17.375, + "logps/rejected": -920.0, + "loss": 0.3151878356933594, + "memory(GiB)": 77.37, + "nll_loss": 0.2060546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 13.8125, + "rewards/rejected": -0.353515625, + "step": 125, + "train_speed(iter/s)": 0.064742 + }, + { + "epoch": 2.6, + "grad_norm": 2.121898818349653, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58984375, + "logps/chosen": -302.0, + "logps/rejected": -255.0, + "loss": 0.30629501342773435, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.0, + "rewards/margins": 19.75, + "rewards/rejected": 0.208984375, + "step": 130, + "train_speed(iter/s)": 0.065152 + }, + { + "epoch": 2.7, + "grad_norm": 0.9713606046745735, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.57421875, + "logits/rejected": -0.224609375, + "logps/chosen": -118.0, + "logps/rejected": -560.0, + "loss": 0.2922694206237793, + "memory(GiB)": 77.37, + "nll_loss": 0.111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.125, + "rewards/margins": 15.3125, + "rewards/rejected": -2.203125, + "step": 135, + "train_speed(iter/s)": 0.065246 + }, + { + "epoch": 2.8, + "grad_norm": 0.42207060142345165, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": -0.45703125, + "logits/rejected": -0.49609375, + "logps/chosen": -274.0, + "logps/rejected": -604.0, + 
"loss": 0.3446833610534668, + "memory(GiB)": 77.37, + "nll_loss": 0.490234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 18.75, + "rewards/rejected": 0.06982421875, + "step": 140, + "train_speed(iter/s)": 0.065537 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.03125, + "eval_logits/rejected": -0.1552734375, + "eval_logps/chosen": -6.28125, + "eval_logps/rejected": -280.0, + "eval_loss": 0.42333984375, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.5, + "eval_runtime": 7.0582, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 1.6580702623605046, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": -0.12255859375, + "logits/rejected": -0.359375, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.3518634796142578, + "memory(GiB)": 77.37, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.25, + "rewards/margins": 18.75, + "rewards/rejected": 1.515625, + "step": 145, + "train_speed(iter/s)": 0.06474 + }, + { + "epoch": 3.0, + "grad_norm": 0.4467300458184829, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 0.0299072265625, + "logits/rejected": -0.400390625, + "logps/chosen": -318.0, + "logps/rejected": -248.0, + "loss": 0.2712591886520386, + "memory(GiB)": 77.37, + "nll_loss": 0.29296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.25, + "rewards/margins": 21.125, + "rewards/rejected": 2.125, + "step": 150, + "train_speed(iter/s)": 0.064444 + }, + { + "epoch": 3.1, + "grad_norm": 0.3431419537396095, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": -0.373046875, + "logits/rejected": -0.443359375, + "logps/chosen": -436.0, + "logps/rejected": -536.0, + "loss": 0.31831893920898435, + "memory(GiB)": 77.37, + "nll_loss": 0.275390625, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.875, + "rewards/margins": 21.875, + "rewards/rejected": 0.024658203125, + "step": 155, + "train_speed(iter/s)": 0.064601 + }, + { + "epoch": 3.2, + "grad_norm": 0.32759751569486095, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": -0.39453125, + "logits/rejected": -0.70703125, + "logps/chosen": -324.0, + "logps/rejected": -346.0, + "loss": 0.2718070030212402, + "memory(GiB)": 77.37, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.5, + "rewards/margins": 20.0, + "rewards/rejected": 3.484375, + "step": 160, + "train_speed(iter/s)": 0.064486 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.0, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -6.625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4287109375, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 14.3125, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0148, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.143, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.6366379388596981, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.6796875, + "logits/rejected": -0.21875, + "logps/chosen": -118.5, + "logps/rejected": -456.0, + "loss": 0.2604236602783203, + "memory(GiB)": 77.37, + "nll_loss": 0.10009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.875, + "rewards/margins": 19.375, + "rewards/rejected": -4.53125, + "step": 165, + "train_speed(iter/s)": 0.064349 + }, + { + "epoch": 3.4, + "grad_norm": 0.4478398792618674, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 0.22265625, + "logits/rejected": -0.62109375, + "logps/chosen": -458.0, + "logps/rejected": -169.0, + "loss": 0.22490353584289552, + "memory(GiB)": 77.37, + "nll_loss": 0.30859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.25, + "rewards/margins": 25.5, + "rewards/rejected": 1.8203125, + "step": 170, + "train_speed(iter/s)": 
0.064149 + }, + { + "epoch": 3.5, + "grad_norm": 0.43618775126579723, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.5625, + "logits/rejected": 0.06494140625, + "logps/chosen": -254.0, + "logps/rejected": -452.0, + "loss": 0.24174847602844238, + "memory(GiB)": 77.37, + "nll_loss": 0.337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.625, + "rewards/margins": 20.625, + "rewards/rejected": 4.9375, + "step": 175, + "train_speed(iter/s)": 0.064217 + }, + { + "epoch": 3.6, + "grad_norm": 0.328809465093868, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": -0.251953125, + "logits/rejected": -0.640625, + "logps/chosen": -388.0, + "logps/rejected": -251.0, + "loss": 0.30023603439331054, + "memory(GiB)": 77.37, + "nll_loss": 0.43359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 26.25, + "rewards/margins": 25.375, + "rewards/rejected": 0.8359375, + "step": 180, + "train_speed(iter/s)": 0.064542 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.1953125, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -284.0, + "eval_loss": 0.4462890625, + "eval_nll_loss": 0.38671875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.90625, + "eval_runtime": 7.0704, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.141, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.49681405036162596, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": -0.09375, + "logits/rejected": -0.45703125, + "logps/chosen": -358.0, + "logps/rejected": -195.0, + "loss": 0.25345821380615235, + "memory(GiB)": 77.37, + "nll_loss": 0.412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 22.375, + "rewards/rejected": 2.671875, + "step": 185, + "train_speed(iter/s)": 0.064516 + }, + { + "epoch": 3.8, + "grad_norm": 0.5063889084098974, + "learning_rate": 1.4997896271528739e-05, + 
"logits/chosen": -0.60546875, + "logits/rejected": -0.2734375, + "logps/chosen": -183.0, + "logps/rejected": -544.0, + "loss": 0.2541311264038086, + "memory(GiB)": 77.37, + "nll_loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.875, + "rewards/margins": 25.25, + "rewards/rejected": -0.404296875, + "step": 190, + "train_speed(iter/s)": 0.064791 + }, + { + "epoch": 3.9, + "grad_norm": 0.5269282280846967, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": 0.435546875, + "logits/rejected": -0.10546875, + "logps/chosen": -324.0, + "logps/rejected": -556.0, + "loss": 0.21763362884521484, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 22.5, + "rewards/rejected": 0.051513671875, + "step": 195, + "train_speed(iter/s)": 0.06463 + }, + { + "epoch": 4.0, + "grad_norm": 0.30158893716839064, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": -0.0654296875, + "logits/rejected": -0.056884765625, + "logps/chosen": -235.0, + "logps/rejected": -568.0, + "loss": 0.24068713188171387, + "memory(GiB)": 77.37, + "nll_loss": 0.25390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 21.625, + "rewards/rejected": 0.296875, + "step": 200, + "train_speed(iter/s)": 0.064458 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.19921875, + "eval_logps/chosen": -7.90625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.376953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.6875, + "eval_runtime": 7.0218, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.142, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.5197111532124594, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": -0.17578125, + "logits/rejected": -0.1318359375, + "logps/chosen": -270.0, + "logps/rejected": 
-556.0, + "loss": 0.2583838939666748, + "memory(GiB)": 77.37, + "nll_loss": 0.23828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 23.125, + "rewards/margins": 19.125, + "rewards/rejected": 4.0625, + "step": 205, + "train_speed(iter/s)": 0.064077 + }, + { + "epoch": 4.2, + "grad_norm": 0.6236218136792746, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": -0.2294921875, + "logits/rejected": -0.48828125, + "logps/chosen": -184.0, + "logps/rejected": -220.0, + "loss": 0.2005645751953125, + "memory(GiB)": 77.37, + "nll_loss": 0.208984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.75, + "rewards/margins": 24.0, + "rewards/rejected": 1.8515625, + "step": 210, + "train_speed(iter/s)": 0.064191 + }, + { + "epoch": 4.3, + "grad_norm": 0.47824031212563534, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -0.494140625, + "logits/rejected": 0.1650390625, + "logps/chosen": -117.0, + "logps/rejected": -972.0, + "loss": 0.23559434413909913, + "memory(GiB)": 77.37, + "nll_loss": 0.1142578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 21.625, + "rewards/rejected": -2.890625, + "step": 215, + "train_speed(iter/s)": 0.064213 + }, + { + "epoch": 4.4, + "grad_norm": 0.3885497874914557, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -0.5625, + "logits/rejected": -0.08203125, + "logps/chosen": -70.5, + "logps/rejected": -864.0, + "loss": 0.21706581115722656, + "memory(GiB)": 77.37, + "nll_loss": 0.052978515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 19.625, + "rewards/rejected": -3.921875, + "step": 220, + "train_speed(iter/s)": 0.064109 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.20703125, + "eval_logps/chosen": -8.25, + "eval_logps/rejected": -286.0, + "eval_loss": 0.44921875, + "eval_nll_loss": 0.392578125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.5, + "eval_rewards/margins": 14.625, + 
"eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0379, + "eval_samples_per_second": 0.568, + "eval_steps_per_second": 0.142, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.38319446798926815, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": -0.12890625, + "logits/rejected": -0.625, + "logps/chosen": -470.0, + "logps/rejected": -201.0, + "loss": 0.23574182987213135, + "memory(GiB)": 77.37, + "nll_loss": 0.486328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 32.25, + "rewards/margins": 27.125, + "rewards/rejected": 5.15625, + "step": 225, + "train_speed(iter/s)": 0.063935 + }, + { + "epoch": 4.6, + "grad_norm": 0.4331863840312884, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.028076171875, + "logits/rejected": -0.5234375, + "logps/chosen": -208.0, + "logps/rejected": -676.0, + "loss": 0.2071385383605957, + "memory(GiB)": 77.37, + "nll_loss": 0.26171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 23.125, + "rewards/rejected": 1.828125, + "step": 230, + "train_speed(iter/s)": 0.063909 + }, + { + "epoch": 4.7, + "grad_norm": 0.7273350214373521, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": -0.58984375, + "logits/rejected": -0.078125, + "logps/chosen": -151.0, + "logps/rejected": -624.0, + "loss": 0.1720048427581787, + "memory(GiB)": 77.37, + "nll_loss": 0.1806640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.75, + "rewards/margins": 21.375, + "rewards/rejected": -2.609375, + "step": 235, + "train_speed(iter/s)": 0.06399 + }, + { + "epoch": 4.8, + "grad_norm": 0.32144115051135386, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": -0.384765625, + "logits/rejected": -0.5234375, + "logps/chosen": -304.0, + "logps/rejected": -556.0, + "loss": 0.20636966228485107, + "memory(GiB)": 77.37, + "nll_loss": 0.439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 28.75, + "rewards/margins": 24.375, + "rewards/rejected": 4.34375, + "step": 240, + 
"train_speed(iter/s)": 0.064197 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -0.98046875, + "eval_logits/rejected": -0.20703125, + "eval_logps/chosen": -8.4375, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4521484375, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.4375, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0748, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 240 + }, + { + "epoch": 4.9, + "grad_norm": 0.4466988097200344, + "learning_rate": 1.0978021666005478e-07, + "logits/chosen": -0.220703125, + "logits/rejected": -0.5390625, + "logps/chosen": -410.0, + "logps/rejected": -478.0, + "loss": 0.25318150520324706, + "memory(GiB)": 77.37, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.75, + "rewards/margins": 23.5, + "rewards/rejected": 2.328125, + "step": 245, + "train_speed(iter/s)": 0.06414 + }, + { + "epoch": 5.0, + "grad_norm": 0.3187395254363038, + "learning_rate": 0.0, + "logits/chosen": -0.4296875, + "logits/rejected": -0.58984375, + "logps/chosen": -270.0, + "logps/rejected": -420.0, + "loss": 0.2871107816696167, + "memory(GiB)": 77.37, + "nll_loss": 0.298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": 24.125, + "rewards/margins": 23.875, + "rewards/rejected": 0.26171875, + "step": 250, + "train_speed(iter/s)": 0.064317 + }, + { + "epoch": 5.0, + "eval_logits/chosen": -0.9765625, + "eval_logits/rejected": -0.2080078125, + "eval_logps/chosen": -8.4375, + "eval_logps/rejected": -286.0, + "eval_loss": 0.45166015625, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.4375, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -5.09375, + "eval_runtime": 7.0019, + "eval_samples_per_second": 0.571, + "eval_steps_per_second": 0.143, + "step": 250 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + 
"num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1365779831848960.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d67ee3a1235d4fd5826e5b60e8579bd4ef9f315 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb21d6a08ea15124a1dc84b58c466472657c18daec6fb2c523dfe9f9f5d103e +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + 
"training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80875605b9b9e54dc4ca8949c31f972a0c53069d --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be8e0b03b943949e4a899a30dffac351eec4381a93328e58fa86753c3085767 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9100d72e813cf3f1d87a21a7b20936a3f70a004c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940ef540b1be746b3c16eba0026215447819d5f6b34c10da6d998de76cd5ed63 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd3e7f58736ce533d95023fb865ae83f92a25c2 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4c8566d2bffc6227f3d99b848d9e78d73f6213e88fdd4f20e2d185d4761c0d +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d278f1ea3f5770f52e36625bf9d44a4eaa1daf77 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3869cec7a21aabbd73290678bb15089e10ef94069684a731c3072b174be5d3b6 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f60a7285917285f5fd1e566768ea4d977bbe044 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd52aba29c39b79845bd6f27098fa6affbffee55e919ab2e54a65d10ea075c6 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..09ee4125aa70d0e8e2bd20bce40f7fd59408b836 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52acbed05db13cc8adbec75a0ae45739eaaa14fd479b1d7f10f3f093bbecbd5f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..448ef9640d72edb6ade57fb74f3102f252017f86 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942c926cae1234903075de20f0e480353e773e3ec90f4ca853161aa8e40385e3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c33b0ed549fdbbe17fc8bbbfdb3e2cefd6911b90 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce9051111fd27fc8a87760ee66dedd2d4f2af45f93923fbae96dd2dd81425ba +size 
155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bcffb48ed29cf935e87696b06d3800f19de4ffa --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c8ffd4611bb09396e08d380ff3ce10b0085aee05d094fb471060a45507e1ea +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9754dc4cea53aa9bc44de5358330e3b1e1475be6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e25bca31d4189fce4d27b2e0ecc9a65725427d6ea102128c1eab6be28d7ba8 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6e2efca5c9bfc8e701f74a206aab727a4bfe8907 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547af713cb635c93bc314e50ef3f97bb67a5c0b2c1db4da2ba4daa9eb54be31e +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e6ee38440e2979effa4f34ba2c9d18a28633c4d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28eaa9cec67ff4a549999914d0e6318bf95f75e02114d276b83d75ddf83773ab +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6450e6693c793db3c0635360f8f46be545e3f06c --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292d7133c261a316449969f7acb1c8765251b9c91a483e0cfdebc5902487e90d +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64b019aade36ffb046a822b6feabefbe9d71ba6f --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422be9302c3d118d4908e04c7014d15e9697c557e19307a0e0706b4532c9328d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ccdee1cb5a4f76e3c6215cb6aec5de27d26851 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68869f865c86ce9f47b6a66ffc7ea070031db04d2458cbe22bea681d78c1516 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..393ad6b3854075efb4a2a4e7aa2d888ebb7932b4 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89dd26b5d1c7d6cc6f29cc02b865025ea51c3bed626dd096bbbc6e1cb3e2321f +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..35252259eb09d8de259231f63f19e786e44bc7b0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb7d8df6ed170dd98dba8737bc9dd038af61afd --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e144a445ffd57fbb5be9b5131f17149bde6c4ff5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10f35268ac2a0cb68abc0b78ba5b150b0f29d78 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6ef21562e384e0889ec2400e8f84b6b0bc59035 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..126662e5f97dd3f9cd7fb87e70843d3d5532dde3 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4e6b27cc99b0fa8e6bbf967892f9304b444d81d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..14e214a61e5311f2b7edf2200ec0365ed2dcc5e1 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f899c511136dcc56c12c5058062d17686812d1b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab67fe2c6ce9ef0b836ccb0828d5bb1b593055af205b40af49c85139e46a6c8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2e1becefb63299f53cd3f496ca59199016545b0a --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.44287109, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40", + "epoch": 0.8, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": 
-490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 219708701278208.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5e74a70b8380d675d64c4382eca773a703482ebd --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee4f381ac9a995bb86f8e319829ee47093b447100146c8f4042a8f96025f8fcf +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + 
"training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62aaec5300ec2bc30ed9e0b0ba0037c8946e3be8 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93e6b5192100e7fb1c5fd8ba93e27e0d2975a343be7e85d4ae389ac309f6fcb +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..abe14fbbef36b23fc8ed1ae1cccaadd9b8d8f967 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf5c2e51abbcd9101d00c6ccd6852c0149c3c3b13bfe5ab8089fd6143910fe91 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e77f5edd2722c97fbd9985140e45a73ebc56dd7e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef0bc51d160ebb860ea47731a9a9c88341302d972e85ab58b24582295129150 +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5f39486d25874453cfd0b9952825853b92ac581 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b59f6ff1036e2fd1d774281a5b3b7c5c24c5432b28ce3d8a65e84b124222584 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab64eb8db10be0bb34097a64159665deb6ceb694 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f102d352689e095815ef6a8dcac74df7ea1160cb9dbd3ccab54440e0813f02e +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f67637ab5e0dee6d3153f8d5e6f9c62fd40464f0 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a448fe1df268b15bd0f3fdabf3d3294f77d184efe2babd9e2921e864d3fab2d +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89a73f10ea751e85b6333a2bacee86eee74baa7b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921743565fd54e947d87bb05426223dcec9df84789d6d4f765979cc679f1ac87 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9defaf51b8156d0c25860257dd8064064232e62 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b852bbe1dd1a49f12ca7bfff8c62bc5a8ba561699ce0108fed54215679bbae +size 
155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29deaea9c58661ef97ee870255342e1d172aac83 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c284720c6a155eaa9f223a7164257bee0a6858a14e74d7dd38421bebb3be828 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d41cb8972ddf880b3c1a50ddacc4334a8a70f64 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4156d535ba2e53127fc1c9178c60f68f966a337f784b0bcd2fc4a2a4ad9880bf +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b12bf3bb62c4250241f7bedf81dbe10b3767a994 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740baf51b654bd86129f67a1e19c8064e692d262a8c0f783c36439e62cbb93bd +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74a3f4bd80fabe6858c395d9eb1b00aa3ad40a42 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d567cb37ea33f6ebec8c58a4c34940f780218e0021986c8dbe911d7ad9ec996d +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..754870cad9171c351d5adf5ee1e56900ce5b2411 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a30d0eac44269c76d36e28c38af1aa7363606a24d80ebf26bbe124c16fcc3b +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e446bf584ba9a71d6e633c8b7847cf5191b6837d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d468b3f5b562b3fe888d3898ffeffa1262e040b9b6e68273ac0d87116a063b4 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ee8344c8591d75ecb708e55b2ce7f169dcd718 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41b50466f79426e77fbf5d50d6c30919e31c9974901356eefb01f2898a7e163 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74adff193bd72452111546fef8b88d12977c0d29 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82879572d7ae33455092506af3e3daa827253fcdd7b0ceab4b9d927debbbff3 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..728c3241a49cbd920d5df86255fc8be4d97c5519 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa3ba485fff4300fd9029c17ba92c92630af852a00df0a0e8d16c233f74cbc8 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b6c1f963310a025d27ee4ff6d297cd9168bcf1b5 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.421875, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60", + "epoch": 1.2, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": 
-490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 326372966268928.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/training_args.bin 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/README.md b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information 
Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f90914c42451e80325499acac081462672ee39e9 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_model.safetensors b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cfbc657f026f7dff4997ec63b89050dce7221424 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb28604031011736cd1cd2aae1d48964a8c0825ccae8f7168077c3034e430d0c +size 207244392 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/additional_config.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/args.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..931c5dd73b310726a28ff16bfde9d41041b8a8a6 --- 
/dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_what.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + 
"remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": 
null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + 
"galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-70b", + 
"model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + 
"training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, 
push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e00c52284970df2e90cef07bbde472bd773c3de9 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc0c3c7131b020bc4a7d030663fe2e97641e43b20b4c1f4f0e6ce5e61773fd9 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f49004efb7b54853aad712ecf7c7dc1639fafb04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581662dc27110bc3bbca4bc3f24b2914a79a36af6f91e9441b9440099a366f9f +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fb4d50d2bfaea1ab9bca617d2531ecbcc44ec25 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a7b9374bb206d62f727171a421f8ad84df38e955587d97f1e21f261b82e0a4d +size 155324336 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0dbe9ae00a8fcd0677ed91b76f756ae4313fb80 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6436c5f95a5e3675858ac96c9be3fb3759d73dc0b59f11647ff4d125f505f7c +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eec879be464d4755aa94a4a0ff0e482cd5995f1b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c9274809bb8b624e00caef5d05a70683df737cab94b8bfb66e773b87259e51 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ae6e24167229a58a3fdb5813a86311158800e795 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:034af2bfbdc6cdcc535ca2caf19d025a5aecc46735b48725bdc547d6cb699e7a +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4755cf62ec172a33646d360a86c4a7ae53f120c6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6979ab352238c68cd562b3ecb275a180dcf64ea2383dc44ebbe50701419e3d3 +size 155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..431d2342b48fa703432dd0d4e5bdfaefe1e2138b --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc659be6851a86fc08dc604773bc513680bc525ac89846bd22f105cffebaa19 +size 
155324336 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73e38c53f23f63a98ec9581d947d5afd6a2e98db --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e7de4e3ccba42c909313ed26f149051f892f5861085cecf4a3cf12076740a2 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f0bf213154388ad221c77d7a4bf8f700f5e4534 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89e657a3fa1296cf8b2bc16e5d67732135b80ce4671a2770264ce51ed3513b71 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..765a4c2fefda0cac967cbd088abeba9c8413d9c1 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc4ae517cd158c8d9476b726d4c4309889718bca4c975a54764f6f58671baa6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7bbef9f6866dfdeb0c1b8f50404d66327426e86 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea66f33da2f5c3ca1aeb17297dc91104cf4c07d8f73e0c159761208eae994148 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50d3758a8d27aff7b69ce3ec2720332a6ed925b6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71fd8aaf20583187252ea5247d72cac1fa3c08a3f5034bf612f5a1f2ba5aa5f +size 1107846 diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f354e5143107b1786a195973935715e516ed6288 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ddc9385ff0b7b3654dc493f0db74012aab5c3cf20551f2f15e49926566e9ef6 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a278b47a4846eba296205bed86e9d5885783a6 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb720a6b01bbb827922d7cc16f85e38ebd798c724910a1e0aa03f27f054b45db +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11fbbd7f76eaaccb51396011384493cd3e3ab681 --- /dev/null 
+++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02027957976e2c5706e7776260b7e1dfd70c633e9d6c5c538b1b0ef230442315 +size 1107846 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/latest b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_0.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd2a62da4ca83b3b986d96dbf0eaeb82207ca93 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0628a9017696045a3a29e9eaffc71e9262d855716e773c0c3be760a1fe85bc8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_1.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ba5f3aba4388a582cd47f7f9e57cd5879b1cbd2 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df342004a4d8e3626bf2a9f689fde7c8bfd6d995e14931f5496eda1f456cb6f2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_2.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27b0f7845c2b9530c3e6ed3ce232ff4e86b86122 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02096eb4e8850b91490e80e4a042e2e60f71bd2abc6a269d62c271649cb77d2 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_3.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfb583fc43c6dd4395671708744cfd18c419970 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c778d3d0e7e3d5665fa0a9ecd92986609c430da08b41611d6c05dc19815a8 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_4.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a8c64b1f15ac655b2be2a42fe61cabe2a877704 --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d978dcb0c34e022ee6750e9d86814b8c82e4965d7e07662f35f06eeac12938f3 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_5.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..262e8187e6caeca12ef3b0aa923b12afd697e03d --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e83399aed1d9d173c3e07b2efa8530c956b62b2b68394c2ed0d43bd8bba9d1 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_6.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..72f794e31f8d3e0c63972e5076e1ed90c52087ba --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606ab3ca92e3d20c327c69fdcce7f7e39bec2f2c3538b036088b255f917e3ba4 +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_7.pth b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..244e7fdaa1cef2e82bd4e16afb10f32f68318bcc --- /dev/null +++ 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276a987dd22c9093fec58921ba19f340a28f18bff635cc01324e09a3c37ac3a +size 15984 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/scheduler.pt b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e36a588df493151f57c8f73aa08129a3810c2c7 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee30cdff92a069fa950619177f737b278c096bc7c83c0e5bdea15a673218022 +size 1064 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/trainer_state.json b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d40a78733fb10260e62a8bc8bab0f3384e819166 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.421875, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-60", + "epoch": 1.6, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 13.5144849172613, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": -0.494140625, + "logits/rejected": -0.50390625, + "logps/chosen": 
-490.0, + "logps/rejected": -156.0, + "loss": 2.4296875, + "memory(GiB)": 28.09, + "nll_loss": 0.50390625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.049656 + }, + { + "epoch": 0.1, + "grad_norm": 13.984602895852001, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 1.5625, + "logits/rejected": 0.10693359375, + "logps/chosen": -572.0, + "logps/rejected": -234.0, + "loss": 2.275634765625, + "memory(GiB)": 31.11, + "nll_loss": 0.765625, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2001953125, + "rewards/margins": -0.1875, + "rewards/rejected": -0.01251220703125, + "step": 5, + "train_speed(iter/s)": 0.070981 + }, + { + "epoch": 0.2, + "grad_norm": 9.879185725004177, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.408203125, + "logits/rejected": -0.40625, + "logps/chosen": -488.0, + "logps/rejected": -468.0, + "loss": 1.98564453125, + "memory(GiB)": 34.78, + "nll_loss": 1.2578125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.37890625, + "rewards/rejected": 0.169921875, + "step": 10, + "train_speed(iter/s)": 0.065781 + }, + { + "epoch": 0.3, + "grad_norm": 6.6662069729619855, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.061279296875, + "logits/rejected": 0.05078125, + "logps/chosen": -462.0, + "logps/rejected": -928.0, + "loss": 1.342333984375, + "memory(GiB)": 61.22, + "nll_loss": 0.9140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.90625, + "rewards/margins": 1.9609375, + "rewards/rejected": 1.9375, + "step": 15, + "train_speed(iter/s)": 0.061185 + }, + { + "epoch": 0.4, + "grad_norm": 1.470369288221021, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": -0.51953125, + "logits/rejected": -0.453125, + "logps/chosen": -300.0, + "logps/rejected": -290.0, + "loss": 1.189111328125, + "memory(GiB)": 61.22, + "nll_loss": 0.482421875, + 
"rewards/accuracies": 0.800000011920929, + "rewards/chosen": 7.03125, + "rewards/margins": 2.4375, + "rewards/rejected": 4.59375, + "step": 20, + "train_speed(iter/s)": 0.062635 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": -0.462890625, + "eval_logps/chosen": -7.125, + "eval_logps/rejected": -178.0, + "eval_loss": 0.490234375, + "eval_nll_loss": 0.33984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 3.9375, + "eval_rewards/rejected": 5.6875, + "eval_runtime": 7.0921, + "eval_samples_per_second": 0.564, + "eval_steps_per_second": 0.141, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.7360842799133988, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": -0.380859375, + "logits/rejected": -0.376953125, + "logps/chosen": -254.0, + "logps/rejected": -372.0, + "loss": 0.6099853515625, + "memory(GiB)": 65.14, + "nll_loss": 0.4453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.125, + "rewards/margins": 8.0625, + "rewards/rejected": 3.078125, + "step": 25, + "train_speed(iter/s)": 0.061887 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031711214032629, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.78515625, + "logits/rejected": -0.7109375, + "logps/chosen": -314.0, + "logps/rejected": -584.0, + "loss": 0.522998046875, + "memory(GiB)": 65.14, + "nll_loss": 0.6328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.8125, + "rewards/margins": 9.5625, + "rewards/rejected": 1.234375, + "step": 30, + "train_speed(iter/s)": 0.063962 + }, + { + "epoch": 0.7, + "grad_norm": 1.192561076964146, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": -0.263671875, + "logits/rejected": -0.58203125, + "logps/chosen": -420.0, + "logps/rejected": -201.0, + "loss": 0.52447509765625, + "memory(GiB)": 65.14, + "nll_loss": 0.68359375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.125, + "rewards/margins": 11.1875, + "rewards/rejected": 3.953125, + 
"step": 35, + "train_speed(iter/s)": 0.065833 + }, + { + "epoch": 0.8, + "grad_norm": 0.7003125541995815, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": -0.6875, + "logits/rejected": -0.51171875, + "logps/chosen": -174.0, + "logps/rejected": -342.0, + "loss": 0.461572265625, + "memory(GiB)": 65.14, + "nll_loss": 0.52734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.625, + "rewards/margins": 10.1875, + "rewards/rejected": 2.484375, + "step": 40, + "train_speed(iter/s)": 0.064604 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.94921875, + "eval_logits/rejected": -0.326171875, + "eval_logps/chosen": -6.5625, + "eval_logps/rejected": -230.0, + "eval_loss": 0.44287109375, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.625, + "eval_rewards/margins": 9.125, + "eval_rewards/rejected": 0.5, + "eval_runtime": 7.0822, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.141, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.6240116852703075, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": -0.51171875, + "logits/rejected": -0.73828125, + "logps/chosen": -378.0, + "logps/rejected": -416.0, + "loss": 0.45257568359375, + "memory(GiB)": 72.04, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.25, + "rewards/margins": 15.0, + "rewards/rejected": 3.203125, + "step": 45, + "train_speed(iter/s)": 0.063816 + }, + { + "epoch": 1.0, + "grad_norm": 0.9089784106511326, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": -0.2177734375, + "logits/rejected": -0.01324462890625, + "logps/chosen": -386.0, + "logps/rejected": -1000.0, + "loss": 0.44512939453125, + "memory(GiB)": 72.04, + "nll_loss": 0.390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.0625, + "rewards/margins": 11.875, + "rewards/rejected": 1.1171875, + "step": 50, + "train_speed(iter/s)": 0.064594 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342183245129553, + "learning_rate": 
9.244916810456821e-05, + "logits/chosen": -0.08154296875, + "logits/rejected": -0.4296875, + "logps/chosen": -314.0, + "logps/rejected": -604.0, + "loss": 0.4583740234375, + "memory(GiB)": 77.37, + "nll_loss": 0.4765625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 17.5, + "rewards/rejected": 1.5078125, + "step": 55, + "train_speed(iter/s)": 0.064662 + }, + { + "epoch": 1.2, + "grad_norm": 0.3307989428519516, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": -0.275390625, + "logits/rejected": -0.2734375, + "logps/chosen": -177.0, + "logps/rejected": -540.0, + "loss": 0.415283203125, + "memory(GiB)": 77.37, + "nll_loss": 0.32421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.0, + "rewards/rejected": -1.1875, + "step": 60, + "train_speed(iter/s)": 0.065074 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.8828125, + "eval_logits/rejected": -0.138671875, + "eval_logps/chosen": -5.5, + "eval_logps/rejected": -270.0, + "eval_loss": 0.421875, + "eval_nll_loss": 0.26171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.75, + "eval_rewards/margins": 13.25, + "eval_rewards/rejected": -3.5, + "eval_runtime": 7.0588, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.142, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.6235236115981455, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": -0.2578125, + "logits/rejected": 0.048583984375, + "logps/chosen": -250.0, + "logps/rejected": -708.0, + "loss": 0.46671142578125, + "memory(GiB)": 77.37, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.6875, + "rewards/margins": 15.0, + "rewards/rejected": -1.3515625, + "step": 65, + "train_speed(iter/s)": 0.064345 + }, + { + "epoch": 1.4, + "grad_norm": 0.4510421974711146, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.765625, + "logits/rejected": -0.09765625, + "logps/chosen": -48.0, + "logps/rejected": -876.0, + 
"loss": 0.3884857177734375, + "memory(GiB)": 77.37, + "nll_loss": 0.50390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 12.0, + "rewards/rejected": -0.7578125, + "step": 70, + "train_speed(iter/s)": 0.065067 + }, + { + "epoch": 1.5, + "grad_norm": 0.46094113890076227, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": -0.34375, + "logits/rejected": -0.388671875, + "logps/chosen": -344.0, + "logps/rejected": -376.0, + "loss": 0.4062408447265625, + "memory(GiB)": 77.37, + "nll_loss": 0.353515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.75, + "rewards/margins": 17.25, + "rewards/rejected": 0.51953125, + "step": 75, + "train_speed(iter/s)": 0.065208 + }, + { + "epoch": 1.6, + "grad_norm": 0.3027474591177724, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 0.28515625, + "logits/rejected": -0.458984375, + "logps/chosen": -426.0, + "logps/rejected": -107.0, + "loss": 0.427960205078125, + "memory(GiB)": 77.37, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.125, + "rewards/margins": 17.5, + "rewards/rejected": 4.75, + "step": 80, + "train_speed(iter/s)": 0.065989 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.859375, + "eval_logits/rejected": -0.11328125, + "eval_logps/chosen": -5.96875, + "eval_logps/rejected": -266.0, + "eval_loss": 0.4228515625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.6875, + "eval_rewards/margins": 12.75, + "eval_rewards/rejected": -3.09375, + "eval_runtime": 7.0664, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.142, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + 
"total_flos": 433133473759232.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/training_args.bin b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d92b3c133dfedef3e30ca551a7ed439168f4b916 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc84e174c65f48dd965e12ce84cfe0c32ab50ff9b8703539d9debd3ec9753fe +size 8888 diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/zero_to_fp32.py b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..7f7de30cf38afb4a026196a0933a28dd3ecbd462 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..726f16ce105c93473b5de48aad2edb9c9a4d9a50 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..224ac3b2d75eecaa2a3eebd93a777152b3238bc4 Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..319d106bf7c8131cbab2d3d2948a1c95da09f401 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..ca9c45c9b2fecc42a825d6315599fcbf60bbd311 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_nll_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..de63cfcc0683f286a110f3ce02ddaf14a60ab43c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_accuracies.png new file mode 100644 index 
0000000000000000000000000000000000000000..bcd3f332a0a9276590b2d2aad6821619831dd64b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..78a170c9152f283ec7a05dff325f4ce032d76f4c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_margins.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..98f3c9c63671592cbaf8cbafeb39b8592066fd71 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..7f06b23be43f92941e4903855da536320b2cc65c Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_runtime.png 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..070c810799f7a2b9cfade1da5b74476eb0d60cec Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..1fdb469efa68815e6c936ddf5c55cfac3a80c8d5 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..ae8cdbd923fca461146b194ba28deab0b2e7afb1 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/eval_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_epoch.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..97933611536b8acda691b8294659ee4282ded3e6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_epoch.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_grad_norm.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..8182aafa25e6c5c21af450f2dade27a62cb48188 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_grad_norm.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_learning_rate.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..f869fa37203441700940366c27c7d20493a62bae Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_learning_rate.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..535b2da4a3a86624eb8315f6530afcad783009be Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c5ff17ec2b31b89352053c1b1b651ef657dda5ed Binary files /dev/null and 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logits_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..bb7e9bdd2fecefce0bc8c5c3ae39d70903da4354 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3bdea87e61faebc910ee9eb9a0e4f6668e6abddf Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_logps_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..9ee9416e99ff7b77681c50d86ec09101426d9191 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_memory(GiB).png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..c98ee6f067809d7442c310464c0f08b39bcda2d6 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_memory(GiB).png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_nll_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a7e4768d20d2cccc1104947b44d0081fea8feea7 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_nll_loss.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_accuracies.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..b4911fa40be13e773f077812bd48e71ac255de9d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_accuracies.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_chosen.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..1972ce6bfaa017ffbb925090a9b4b4a7a4935754 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_chosen.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_margins.png 
b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..27c12814d097329e9eca9ecc3735e5b5179c99c8 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_margins.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_rejected.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3db9587484f38c1b80c917624fea277a8ecfd072 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_rewards_rejected.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_total_flos.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..a62f6f1400454bbb42a2de4b7772c4968ad7fb4d Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_total_flos.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_loss.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..90a099dbcc7608620e00cef38efe02ae3e753b0b Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_loss.png differ diff --git 
a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_runtime.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..0117b82c641ff1feac2e167f8734f62d2d622539 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_runtime.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_samples_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..db1020930f21c867d7146a49d6b7562ac7268d02 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_speed(iter_s).png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..67570ab25318dd044426f86f12c793f584245806 Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_steps_per_second.png b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..59fb7ab74986ea62036641b005a090372c102545 
Binary files /dev/null and b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/logging.jsonl b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94d2b9d9b32f07a9e9b73a608f2abbfad03a6007 --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/logging.jsonl @@ -0,0 +1,66 @@ +{"loss": 2.4296875, "grad_norm": 13.51448492, "learning_rate": 7.69e-06, "memory(GiB)": 28.09, "train_speed(iter/s)": 0.049656, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -156.0, "logps/chosen": -490.0, "logits/rejected": -0.50390625, "logits/chosen": -0.49414062, "nll_loss": 0.50390625, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "15s", "remaining_time": "1h 4m 6s"} +{"loss": 2.27563477, "grad_norm": 13.9846029, "learning_rate": 3.846e-05, "memory(GiB)": 31.11, "train_speed(iter/s)": 0.070981, "rewards/chosen": -0.20019531, "rewards/rejected": -0.01251221, "rewards/accuracies": 0.0, "rewards/margins": -0.1875, "logps/rejected": -234.0, "logps/chosen": -572.0, "logits/rejected": 0.10693359, "logits/chosen": 1.5625, "nll_loss": 0.765625, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "1m 5s", "remaining_time": "53m 41s"} +{"loss": 1.98564453, "grad_norm": 9.87918573, "learning_rate": 7.692e-05, "memory(GiB)": 34.78, "train_speed(iter/s)": 0.065781, "rewards/chosen": 0.55078125, "rewards/rejected": 0.16992188, "rewards/accuracies": 0.60000002, "rewards/margins": 0.37890625, "logps/rejected": -468.0, "logps/chosen": -488.0, "logits/rejected": -0.40625, "logits/chosen": -0.40820312, 
"nll_loss": 1.2578125, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "2m 27s", "remaining_time": "58m 55s"} +{"loss": 1.34233398, "grad_norm": 6.66620697, "learning_rate": 9.998e-05, "memory(GiB)": 61.22, "train_speed(iter/s)": 0.061185, "rewards/chosen": 3.90625, "rewards/rejected": 1.9375, "rewards/accuracies": 1.0, "rewards/margins": 1.9609375, "logps/rejected": -928.0, "logps/chosen": -462.0, "logits/rejected": 0.05078125, "logits/chosen": -0.0612793, "nll_loss": 0.9140625, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "4m 0s", "remaining_time": "1h 2m 47s"} +{"loss": 1.18911133, "grad_norm": 1.47036929, "learning_rate": 9.978e-05, "memory(GiB)": 61.22, "train_speed(iter/s)": 0.062635, "rewards/chosen": 7.03125, "rewards/rejected": 4.59375, "rewards/accuracies": 0.80000001, "rewards/margins": 2.4375, "logps/rejected": -290.0, "logps/chosen": -300.0, "logits/rejected": -0.453125, "logits/chosen": -0.51953125, "nll_loss": 0.48242188, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "5m 14s", "remaining_time": "1h 0m 18s"} +{"eval_loss": 0.49023438, "eval_runtime": 7.0921, "eval_samples_per_second": 0.564, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": 5.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.9375, "eval_logps/rejected": -178.0, "eval_logps/chosen": -7.125, "eval_logits/rejected": -0.46289062, "eval_logits/chosen": -0.8046875, "eval_nll_loss": 0.33984375, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "5m 21s", "remaining_time": "1h 1m 39s"} +{"loss": 0.60998535, "grad_norm": 0.73608428, "learning_rate": 9.937e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.061887, "rewards/chosen": 11.125, "rewards/rejected": 3.078125, "rewards/accuracies": 1.0, "rewards/margins": 8.0625, "logps/rejected": -372.0, "logps/chosen": -254.0, 
"logits/rejected": -0.37695312, "logits/chosen": -0.38085938, "nll_loss": 0.4453125, "epoch": 0.5, "global_step/max_steps": "25/250", "percentage": "10.00%", "elapsed_time": "6m 39s", "remaining_time": "59m 53s"} +{"loss": 0.52299805, "grad_norm": 0.60317112, "learning_rate": 9.874e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.063962, "rewards/chosen": 10.8125, "rewards/rejected": 1.234375, "rewards/accuracies": 1.0, "rewards/margins": 9.5625, "logps/rejected": -584.0, "logps/chosen": -314.0, "logits/rejected": -0.7109375, "logits/chosen": -0.78515625, "nll_loss": 0.6328125, "epoch": 0.6, "global_step/max_steps": "30/250", "percentage": "12.00%", "elapsed_time": "7m 44s", "remaining_time": "56m 45s"} +{"loss": 0.5244751, "grad_norm": 1.19256108, "learning_rate": 9.789e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.065833, "rewards/chosen": 15.125, "rewards/rejected": 3.953125, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -201.0, "logps/chosen": -420.0, "logits/rejected": -0.58203125, "logits/chosen": -0.26367188, "nll_loss": 0.68359375, "epoch": 0.7, "global_step/max_steps": "35/250", "percentage": "14.00%", "elapsed_time": "8m 46s", "remaining_time": "53m 57s"} +{"loss": 0.46157227, "grad_norm": 0.70031255, "learning_rate": 9.683e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.064604, "rewards/chosen": 12.625, "rewards/rejected": 2.484375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -342.0, "logps/chosen": -174.0, "logits/rejected": -0.51171875, "logits/chosen": -0.6875, "nll_loss": 0.52734375, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "10m 14s", "remaining_time": "53m 45s"} +{"eval_loss": 0.44287109, "eval_runtime": 7.0822, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": 0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.125, "eval_logps/rejected": -230.0, 
"eval_logps/chosen": -6.5625, "eval_logits/rejected": -0.32617188, "eval_logits/chosen": -0.94921875, "eval_nll_loss": 0.3125, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "10m 21s", "remaining_time": "54m 23s"} +{"loss": 0.45257568, "grad_norm": 0.62401169, "learning_rate": 9.557e-05, "memory(GiB)": 72.04, "train_speed(iter/s)": 0.063816, "rewards/chosen": 18.25, "rewards/rejected": 3.203125, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -416.0, "logps/chosen": -378.0, "logits/rejected": -0.73828125, "logits/chosen": -0.51171875, "nll_loss": 0.4375, "epoch": 0.9, "global_step/max_steps": "45/250", "percentage": "18.00%", "elapsed_time": "11m 40s", "remaining_time": "53m 10s"} +{"loss": 0.44512939, "grad_norm": 0.90897841, "learning_rate": 9.411e-05, "memory(GiB)": 72.04, "train_speed(iter/s)": 0.064594, "rewards/chosen": 13.0625, "rewards/rejected": 1.1171875, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/rejected": -1000.0, "logps/chosen": -386.0, "logits/rejected": -0.01324463, "logits/chosen": -0.21777344, "nll_loss": 0.390625, "epoch": 1.0, "global_step/max_steps": "50/250", "percentage": "20.00%", "elapsed_time": "12m 49s", "remaining_time": "51m 17s"} +{"loss": 0.45837402, "grad_norm": 0.53421832, "learning_rate": 9.245e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064662, "rewards/chosen": 19.0, "rewards/rejected": 1.5078125, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -604.0, "logps/chosen": -314.0, "logits/rejected": -0.4296875, "logits/chosen": -0.08154297, "nll_loss": 0.4765625, "epoch": 1.1, "global_step/max_steps": "55/250", "percentage": "22.00%", "elapsed_time": "14m 5s", "remaining_time": "49m 59s"} +{"loss": 0.4152832, "grad_norm": 0.33079894, "learning_rate": 9.061e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065074, "rewards/chosen": 13.8125, "rewards/rejected": -1.1875, "rewards/accuracies": 1.0, "rewards/margins": 15.0, 
"logps/rejected": -540.0, "logps/chosen": -177.0, "logits/rejected": -0.2734375, "logits/chosen": -0.27539062, "nll_loss": 0.32421875, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "15m 17s", "remaining_time": "48m 24s"} +{"eval_loss": 0.421875, "eval_runtime": 7.0588, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.75, "eval_rewards/rejected": -3.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.25, "eval_logps/rejected": -270.0, "eval_logps/chosen": -5.5, "eval_logits/rejected": -0.13867188, "eval_logits/chosen": -0.8828125, "eval_nll_loss": 0.26171875, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "15m 24s", "remaining_time": "48m 47s"} +{"loss": 0.46671143, "grad_norm": 0.62352361, "learning_rate": 8.858e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064345, "rewards/chosen": 13.6875, "rewards/rejected": -1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -708.0, "logps/chosen": -250.0, "logits/rejected": 0.04858398, "logits/chosen": -0.2578125, "nll_loss": 0.47070312, "epoch": 1.3, "global_step/max_steps": "65/250", "percentage": "26.00%", "elapsed_time": "16m 45s", "remaining_time": "47m 41s"} +{"loss": 0.38848572, "grad_norm": 0.4510422, "learning_rate": 8.639e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065067, "rewards/chosen": 11.25, "rewards/rejected": -0.7578125, "rewards/accuracies": 1.0, "rewards/margins": 12.0, "logps/rejected": -876.0, "logps/chosen": -48.0, "logits/rejected": -0.09765625, "logits/chosen": -0.765625, "nll_loss": 0.50390625, "epoch": 1.4, "global_step/max_steps": "70/250", "percentage": "28.00%", "elapsed_time": "17m 51s", "remaining_time": "45m 54s"} +{"loss": 0.40624084, "grad_norm": 0.46094114, "learning_rate": 8.404e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065208, "rewards/chosen": 17.75, "rewards/rejected": 0.51953125, 
"rewards/accuracies": 1.0, "rewards/margins": 17.25, "logps/rejected": -376.0, "logps/chosen": -344.0, "logits/rejected": -0.38867188, "logits/chosen": -0.34375, "nll_loss": 0.35351562, "epoch": 1.5, "global_step/max_steps": "75/250", "percentage": "30.00%", "elapsed_time": "19m 5s", "remaining_time": "44m 32s"} +{"loss": 0.42796021, "grad_norm": 0.30274746, "learning_rate": 8.154e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065989, "rewards/chosen": 22.125, "rewards/rejected": 4.75, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -107.0, "logps/chosen": -426.0, "logits/rejected": -0.45898438, "logits/chosen": 0.28515625, "nll_loss": 0.5078125, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "20m 7s", "remaining_time": "42m 46s"} +{"eval_loss": 0.42285156, "eval_runtime": 7.0664, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.6875, "eval_rewards/rejected": -3.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.75, "eval_logps/rejected": -266.0, "eval_logps/chosen": -5.96875, "eval_logits/rejected": -0.11328125, "eval_logits/chosen": -0.859375, "eval_nll_loss": 0.28515625, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "20m 14s", "remaining_time": "43m 1s"} +{"loss": 0.52896729, "grad_norm": 0.60681562, "learning_rate": 7.89e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065427, "rewards/chosen": 14.5625, "rewards/rejected": 0.48828125, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -644.0, "logps/chosen": -217.0, "logits/rejected": -0.31445312, "logits/chosen": -0.36523438, "nll_loss": 0.3125, "epoch": 1.7, "global_step/max_steps": "85/250", "percentage": "34.00%", "elapsed_time": "21m 34s", "remaining_time": "41m 52s"} +{"loss": 0.40529785, "grad_norm": 0.6032099, "learning_rate": 7.614e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065028, "rewards/chosen": 
14.9375, "rewards/rejected": 0.76953125, "rewards/accuracies": 1.0, "rewards/margins": 14.1875, "logps/rejected": -446.0, "logps/chosen": -236.0, "logits/rejected": -0.41796875, "logits/chosen": -0.20214844, "nll_loss": 0.45703125, "epoch": 1.8, "global_step/max_steps": "90/250", "percentage": "36.00%", "elapsed_time": "22m 59s", "remaining_time": "40m 52s"} +{"loss": 0.42055664, "grad_norm": 0.39872701, "learning_rate": 7.326e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064964, "rewards/chosen": 17.75, "rewards/rejected": 0.453125, "rewards/accuracies": 1.0, "rewards/margins": 17.25, "logps/rejected": -472.0, "logps/chosen": -286.0, "logits/rejected": -0.09130859, "logits/chosen": -0.265625, "nll_loss": 0.37695312, "epoch": 1.9, "global_step/max_steps": "95/250", "percentage": "38.00%", "elapsed_time": "24m 17s", "remaining_time": "39m 38s"} +{"loss": 0.38609543, "grad_norm": 0.46367638, "learning_rate": 7.028e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064753, "rewards/chosen": 14.4375, "rewards/rejected": 1.0078125, "rewards/accuracies": 1.0, "rewards/margins": 13.4375, "logps/rejected": -324.0, "logps/chosen": -214.0, "logits/rejected": -0.20996094, "logits/chosen": -0.37695312, "nll_loss": 0.3515625, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "25m 39s", "remaining_time": "38m 29s"} +{"eval_loss": 0.41210938, "eval_runtime": 6.9494, "eval_samples_per_second": 0.576, "eval_steps_per_second": 0.144, "eval_rewards/chosen": 9.8125, "eval_rewards/rejected": -3.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.125, "eval_logps/rejected": -268.0, "eval_logps/chosen": -5.125, "eval_logits/rejected": -0.18945312, "eval_logits/chosen": -1.0078125, "eval_nll_loss": 0.24414062, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "25m 46s", "remaining_time": "38m 39s"} +{"loss": 0.37005005, "grad_norm": 0.39923156, "learning_rate": 6.72e-05, "memory(GiB)": 
77.37, "train_speed(iter/s)": 0.064078, "rewards/chosen": 20.875, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 16.5, "logps/rejected": -544.0, "logps/chosen": -336.0, "logits/rejected": -0.62890625, "logits/chosen": -0.36328125, "nll_loss": 0.43945312, "epoch": 2.1, "global_step/max_steps": "105/250", "percentage": "42.00%", "elapsed_time": "27m 13s", "remaining_time": "37m 36s"} +{"loss": 0.35501404, "grad_norm": 0.59294851, "learning_rate": 6.406e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064054, "rewards/chosen": 17.375, "rewards/rejected": 1.5234375, "rewards/accuracies": 1.0, "rewards/margins": 15.8125, "logps/rejected": -864.0, "logps/chosen": -304.0, "logits/rejected": 0.0039978, "logits/chosen": -0.22753906, "nll_loss": 0.328125, "epoch": 2.2, "global_step/max_steps": "110/250", "percentage": "44.00%", "elapsed_time": "28m 32s", "remaining_time": "36m 19s"} +{"loss": 0.38344574, "grad_norm": 0.36983563, "learning_rate": 6.085e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064216, "rewards/chosen": 19.25, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -584.0, "logps/chosen": -356.0, "logits/rejected": -0.20996094, "logits/chosen": -0.14257812, "nll_loss": 0.36328125, "epoch": 2.3, "global_step/max_steps": "115/250", "percentage": "46.00%", "elapsed_time": "29m 46s", "remaining_time": "34m 56s"} +{"loss": 0.31541901, "grad_norm": 0.54679229, "learning_rate": 5.759e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064713, "rewards/chosen": 17.5, "rewards/rejected": 1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -300.0, "logps/chosen": -282.0, "logits/rejected": -0.47070312, "logits/chosen": -0.42773438, "nll_loss": 0.27148438, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "30m 49s", "remaining_time": "33m 23s"} +{"eval_loss": 0.40161133, "eval_runtime": 7.0901, "eval_samples_per_second": 
0.564, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.875, "eval_rewards/rejected": -3.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.75, "eval_logps/rejected": -274.0, "eval_logps/chosen": -4.4375, "eval_logits/rejected": -0.16113281, "eval_logits/chosen": -1.015625, "eval_nll_loss": 0.21191406, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "30m 56s", "remaining_time": "33m 31s"} +{"loss": 0.31518784, "grad_norm": 0.88296252, "learning_rate": 5.43e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064742, "rewards/chosen": 13.4375, "rewards/rejected": -0.35351562, "rewards/accuracies": 1.0, "rewards/margins": 13.8125, "logps/rejected": -920.0, "logps/chosen": -17.375, "logits/rejected": -0.28710938, "logits/chosen": -0.96875, "nll_loss": 0.20605469, "epoch": 2.5, "global_step/max_steps": "125/250", "percentage": "50.00%", "elapsed_time": "32m 6s", "remaining_time": "32m 6s"} +{"loss": 0.30629501, "grad_norm": 2.12189882, "learning_rate": 5.099e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065152, "rewards/chosen": 20.0, "rewards/rejected": 0.20898438, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -255.0, "logps/chosen": -302.0, "logits/rejected": -0.58984375, "logits/chosen": -0.47851562, "nll_loss": 0.29882812, "epoch": 2.6, "global_step/max_steps": "130/250", "percentage": "52.00%", "elapsed_time": "33m 10s", "remaining_time": "30m 37s"} +{"loss": 0.29226942, "grad_norm": 0.9713606, "learning_rate": 4.768e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065246, "rewards/chosen": 13.125, "rewards/rejected": -2.203125, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -560.0, "logps/chosen": -118.0, "logits/rejected": -0.22460938, "logits/chosen": -0.57421875, "nll_loss": 0.11132812, "epoch": 2.7, "global_step/max_steps": "135/250", "percentage": "54.00%", "elapsed_time": "34m 24s", "remaining_time": "29m 18s"} +{"loss": 0.34468336, 
"grad_norm": 0.4220706, "learning_rate": 4.438e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065537, "rewards/chosen": 18.875, "rewards/rejected": 0.06982422, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/rejected": -604.0, "logps/chosen": -274.0, "logits/rejected": -0.49609375, "logits/chosen": -0.45703125, "nll_loss": 0.49023438, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "35m 31s", "remaining_time": "27m 54s"} +{"eval_loss": 0.42333984, "eval_runtime": 7.0582, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": -4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.125, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.28125, "eval_logits/rejected": -0.15527344, "eval_logits/chosen": -1.03125, "eval_nll_loss": 0.29882812, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "35m 38s", "remaining_time": "28m 0s"} +{"loss": 0.35186348, "grad_norm": 1.65807026, "learning_rate": 4.11e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06474, "rewards/chosen": 20.25, "rewards/rejected": 1.515625, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/rejected": -572.0, "logps/chosen": -440.0, "logits/rejected": -0.359375, "logits/chosen": -0.12255859, "nll_loss": 0.46875, "epoch": 2.9, "global_step/max_steps": "145/250", "percentage": "58.00%", "elapsed_time": "37m 15s", "remaining_time": "26m 58s"} +{"loss": 0.27125919, "grad_norm": 0.44673005, "learning_rate": 3.786e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064444, "rewards/chosen": 23.25, "rewards/rejected": 2.125, "rewards/accuracies": 1.0, "rewards/margins": 21.125, "logps/rejected": -248.0, "logps/chosen": -318.0, "logits/rejected": -0.40039062, "logits/chosen": 0.02990723, "nll_loss": 0.29296875, "epoch": 3.0, "global_step/max_steps": "150/250", "percentage": "60.00%", "elapsed_time": "38m 42s", 
"remaining_time": "25m 48s"} +{"loss": 0.31831894, "grad_norm": 0.34314195, "learning_rate": 3.468e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064601, "rewards/chosen": 21.875, "rewards/rejected": 0.0246582, "rewards/accuracies": 1.0, "rewards/margins": 21.875, "logps/rejected": -536.0, "logps/chosen": -436.0, "logits/rejected": -0.44335938, "logits/chosen": -0.37304688, "nll_loss": 0.27539062, "epoch": 3.1, "global_step/max_steps": "155/250", "percentage": "62.00%", "elapsed_time": "39m 54s", "remaining_time": "24m 27s"} +{"loss": 0.271807, "grad_norm": 0.32759752, "learning_rate": 3.156e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064486, "rewards/chosen": 23.5, "rewards/rejected": 3.484375, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -346.0, "logps/chosen": -324.0, "logits/rejected": -0.70703125, "logits/chosen": -0.39453125, "nll_loss": 0.3046875, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "41m 16s", "remaining_time": "23m 13s"} +{"eval_loss": 0.42871094, "eval_runtime": 7.0148, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.143, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": -4.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.3125, "eval_logps/rejected": -282.0, "eval_logps/chosen": -6.625, "eval_logits/rejected": -0.1953125, "eval_logits/chosen": -1.0, "eval_nll_loss": 0.31640625, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "41m 23s", "remaining_time": "23m 16s"} +{"loss": 0.26042366, "grad_norm": 0.63663794, "learning_rate": 2.852e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064349, "rewards/chosen": 14.875, "rewards/rejected": -4.53125, "rewards/accuracies": 1.0, "rewards/margins": 19.375, "logps/rejected": -456.0, "logps/chosen": -118.5, "logits/rejected": -0.21875, "logits/chosen": -0.6796875, "nll_loss": 0.10009766, "epoch": 3.3, "global_step/max_steps": "165/250", "percentage": 
"66.00%", "elapsed_time": "42m 39s", "remaining_time": "21m 58s"} +{"loss": 0.22490354, "grad_norm": 0.44783988, "learning_rate": 2.558e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064149, "rewards/chosen": 27.25, "rewards/rejected": 1.8203125, "rewards/accuracies": 1.0, "rewards/margins": 25.5, "logps/rejected": -169.0, "logps/chosen": -458.0, "logits/rejected": -0.62109375, "logits/chosen": 0.22265625, "nll_loss": 0.30859375, "epoch": 3.4, "global_step/max_steps": "170/250", "percentage": "68.00%", "elapsed_time": "44m 5s", "remaining_time": "20m 44s"} +{"loss": 0.24174848, "grad_norm": 0.43618775, "learning_rate": 2.274e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064217, "rewards/chosen": 25.625, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 20.625, "logps/rejected": -452.0, "logps/chosen": -254.0, "logits/rejected": 0.06494141, "logits/chosen": 0.5625, "nll_loss": 0.33789062, "epoch": 3.5, "global_step/max_steps": "175/250", "percentage": "70.00%", "elapsed_time": "45m 20s", "remaining_time": "19m 25s"} +{"loss": 0.30023603, "grad_norm": 0.32880947, "learning_rate": 2.002e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064542, "rewards/chosen": 26.25, "rewards/rejected": 0.8359375, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -251.0, "logps/chosen": -388.0, "logits/rejected": -0.640625, "logits/chosen": -0.25195312, "nll_loss": 0.43359375, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "46m 24s", "remaining_time": "18m 2s"} +{"eval_loss": 0.44628906, "eval_runtime": 7.0704, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -4.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.375, "eval_logps/rejected": -284.0, "eval_logps/chosen": -8.125, "eval_logits/rejected": -0.1953125, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.38671875, "epoch": 3.6, 
"global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "46m 31s", "remaining_time": "18m 5s"} +{"loss": 0.25345821, "grad_norm": 0.49681405, "learning_rate": 1.744e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064516, "rewards/chosen": 25.0, "rewards/rejected": 2.671875, "rewards/accuracies": 1.0, "rewards/margins": 22.375, "logps/rejected": -195.0, "logps/chosen": -358.0, "logits/rejected": -0.45703125, "logits/chosen": -0.09375, "nll_loss": 0.41210938, "epoch": 3.7, "global_step/max_steps": "185/250", "percentage": "74.00%", "elapsed_time": "47m 42s", "remaining_time": "16m 45s"} +{"loss": 0.25413113, "grad_norm": 0.50638891, "learning_rate": 1.5e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064791, "rewards/chosen": 24.875, "rewards/rejected": -0.40429688, "rewards/accuracies": 1.0, "rewards/margins": 25.25, "logps/rejected": -544.0, "logps/chosen": -183.0, "logits/rejected": -0.2734375, "logits/chosen": -0.60546875, "nll_loss": 0.25, "epoch": 3.8, "global_step/max_steps": "190/250", "percentage": "76.00%", "elapsed_time": "48m 47s", "remaining_time": "15m 24s"} +{"loss": 0.21763363, "grad_norm": 0.52692823, "learning_rate": 1.271e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06463, "rewards/chosen": 22.5, "rewards/rejected": 0.05151367, "rewards/accuracies": 1.0, "rewards/margins": 22.5, "logps/rejected": -556.0, "logps/chosen": -324.0, "logits/rejected": -0.10546875, "logits/chosen": 0.43554688, "nll_loss": 0.26171875, "epoch": 3.9, "global_step/max_steps": "195/250", "percentage": "78.00%", "elapsed_time": "50m 12s", "remaining_time": "14m 9s"} +{"loss": 0.24068713, "grad_norm": 0.30158894, "learning_rate": 1.059e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064458, "rewards/chosen": 21.875, "rewards/rejected": 0.296875, "rewards/accuracies": 1.0, "rewards/margins": 21.625, "logps/rejected": -568.0, "logps/chosen": -235.0, "logits/rejected": -0.05688477, "logits/chosen": -0.06542969, "nll_loss": 0.25390625, 
"epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "51m 38s", "remaining_time": "12m 54s"} +{"eval_loss": 0.4453125, "eval_runtime": 7.0218, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -4.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.1875, "eval_logps/rejected": -282.0, "eval_logps/chosen": -7.90625, "eval_logits/rejected": -0.19921875, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.37695312, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "51m 45s", "remaining_time": "12m 56s"} +{"loss": 0.25838389, "grad_norm": 0.51971115, "learning_rate": 8.63e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064077, "rewards/chosen": 23.125, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -556.0, "logps/chosen": -270.0, "logits/rejected": -0.13183594, "logits/chosen": -0.17578125, "nll_loss": 0.23828125, "epoch": 4.1, "global_step/max_steps": "205/250", "percentage": "82.00%", "elapsed_time": "53m 14s", "remaining_time": "11m 41s"} +{"loss": 0.20056458, "grad_norm": 0.62362181, "learning_rate": 6.87e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064191, "rewards/chosen": 25.75, "rewards/rejected": 1.8515625, "rewards/accuracies": 1.0, "rewards/margins": 24.0, "logps/rejected": -220.0, "logps/chosen": -184.0, "logits/rejected": -0.48828125, "logits/chosen": -0.22949219, "nll_loss": 0.20898438, "epoch": 4.2, "global_step/max_steps": "210/250", "percentage": "84.00%", "elapsed_time": "54m 26s", "remaining_time": "10m 22s"} +{"loss": 0.23559434, "grad_norm": 0.47824031, "learning_rate": 5.29e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064213, "rewards/chosen": 18.75, "rewards/rejected": -2.890625, "rewards/accuracies": 1.0, "rewards/margins": 21.625, "logps/rejected": -972.0, "logps/chosen": -117.0, "logits/rejected": 0.16503906, 
"logits/chosen": -0.49414062, "nll_loss": 0.11425781, "epoch": 4.3, "global_step/max_steps": "215/250", "percentage": "86.00%", "elapsed_time": "55m 43s", "remaining_time": "9m 4s"} +{"loss": 0.21706581, "grad_norm": 0.38854979, "learning_rate": 3.9e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064109, "rewards/chosen": 15.75, "rewards/rejected": -3.921875, "rewards/accuracies": 1.0, "rewards/margins": 19.625, "logps/rejected": -864.0, "logps/chosen": -70.5, "logits/rejected": -0.08203125, "logits/chosen": -0.5625, "nll_loss": 0.05297852, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "57m 6s", "remaining_time": "7m 47s"} +{"eval_loss": 0.44921875, "eval_runtime": 7.0379, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.625, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.25, "eval_logits/rejected": -0.20703125, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.39257812, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "57m 13s", "remaining_time": "7m 48s"} +{"loss": 0.23574183, "grad_norm": 0.38319447, "learning_rate": 2.72e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.063935, "rewards/chosen": 32.25, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 27.125, "logps/rejected": -201.0, "logps/chosen": -470.0, "logits/rejected": -0.625, "logits/chosen": -0.12890625, "nll_loss": 0.48632812, "epoch": 4.5, "global_step/max_steps": "225/250", "percentage": "90.00%", "elapsed_time": "58m 34s", "remaining_time": "6m 30s"} +{"loss": 0.20713854, "grad_norm": 0.43318638, "learning_rate": 1.75e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.063909, "rewards/chosen": 25.0, "rewards/rejected": 1.828125, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -676.0, "logps/chosen": -208.0, 
"logits/rejected": -0.5234375, "logits/chosen": 0.02807617, "nll_loss": 0.26171875, "epoch": 4.6, "global_step/max_steps": "230/250", "percentage": "92.00%", "elapsed_time": "59m 54s", "remaining_time": "5m 12s"} +{"loss": 0.17200484, "grad_norm": 0.72733502, "learning_rate": 9.9e-07, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06399, "rewards/chosen": 18.75, "rewards/rejected": -2.609375, "rewards/accuracies": 1.0, "rewards/margins": 21.375, "logps/rejected": -624.0, "logps/chosen": -151.0, "logits/rejected": -0.078125, "logits/chosen": -0.58984375, "nll_loss": 0.18066406, "epoch": 4.7, "global_step/max_steps": "235/250", "percentage": "94.00%", "elapsed_time": "1h 1m 7s", "remaining_time": "3m 54s"} +{"loss": 0.20636966, "grad_norm": 0.32144115, "learning_rate": 4.4e-07, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064197, "rewards/chosen": 28.75, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 24.375, "logps/rejected": -556.0, "logps/chosen": -304.0, "logits/rejected": -0.5234375, "logits/chosen": -0.38476562, "nll_loss": 0.43945312, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "1h 2m 13s", "remaining_time": "2m 35s"} +{"eval_loss": 0.45214844, "eval_runtime": 7.0748, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.4375, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.4375, "eval_logits/rejected": -0.20703125, "eval_logits/chosen": -0.98046875, "eval_nll_loss": 0.40234375, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "1h 2m 20s", "remaining_time": "2m 35s"} +{"loss": 0.25318151, "grad_norm": 0.44669881, "learning_rate": 1.1e-07, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06414, "rewards/chosen": 25.75, "rewards/rejected": 2.328125, "rewards/accuracies": 1.0, "rewards/margins": 23.5, 
"logps/rejected": -478.0, "logps/chosen": -410.0, "logits/rejected": -0.5390625, "logits/chosen": -0.22070312, "nll_loss": 0.41015625, "epoch": 4.9, "global_step/max_steps": "245/250", "percentage": "98.00%", "elapsed_time": "1h 3m 35s", "remaining_time": "1m 17s"} +{"loss": 0.28711078, "grad_norm": 0.31873953, "learning_rate": 0.0, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064317, "rewards/chosen": 24.125, "rewards/rejected": 0.26171875, "rewards/accuracies": 1.0, "rewards/margins": 23.875, "logps/rejected": -420.0, "logps/chosen": -270.0, "logits/rejected": -0.58984375, "logits/chosen": -0.4296875, "nll_loss": 0.29882812, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 4m 42s", "remaining_time": "0s"} +{"eval_loss": 0.45166016, "eval_runtime": 7.0019, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.143, "eval_rewards/chosen": 9.4375, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.4375, "eval_logits/rejected": -0.20800781, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.40234375, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 4m 49s", "remaining_time": "0s"} +{"train_runtime": 3893.34, "train_samples_per_second": 0.507, "train_steps_per_second": 0.064, "total_flos": 1365779831848960.0, "train_loss": 0.44612469, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "1h 4m 53s", "remaining_time": "0s"} +{"train_dataset": "1140.665823±482.948067, min=300.000000, max=4026.000000, size=395", "val_dataset": "1141.500000±539.854379, min=622.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 70657.2534M Params (103.5469M Trainable [0.1465%]), 0.0001M Buffers.", "last_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-250", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/checkpoint-120", "best_metric": 0.40161133, "global_step": 250, "log_history": [{"loss": 2.4296875, "grad_norm": 13.5144849172613, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 28.09, "train_speed(iter/s)": 0.049656, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -156.0, "logps/chosen": -490.0, "logits/rejected": -0.50390625, "logits/chosen": -0.494140625, "nll_loss": 0.50390625, "epoch": 0.02, "step": 1}, {"loss": 2.275634765625, "grad_norm": 13.984602895852001, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 31.11, "train_speed(iter/s)": 0.070981, "rewards/chosen": -0.2001953125, "rewards/rejected": -0.01251220703125, "rewards/accuracies": 0.0, "rewards/margins": -0.1875, "logps/rejected": -234.0, "logps/chosen": -572.0, "logits/rejected": 0.10693359375, "logits/chosen": 1.5625, "nll_loss": 0.765625, "epoch": 0.1, "step": 5}, {"loss": 1.98564453125, "grad_norm": 9.879185725004177, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 34.78, "train_speed(iter/s)": 0.065781, "rewards/chosen": 0.55078125, "rewards/rejected": 0.169921875, "rewards/accuracies": 0.6000000238418579, "rewards/margins": 0.37890625, "logps/rejected": -468.0, "logps/chosen": -488.0, "logits/rejected": -0.40625, "logits/chosen": -0.408203125, "nll_loss": 1.2578125, "epoch": 0.2, "step": 10}, {"loss": 1.342333984375, "grad_norm": 6.6662069729619855, "learning_rate": 9.998242976313776e-05, "memory(GiB)": 61.22, "train_speed(iter/s)": 0.061185, "rewards/chosen": 3.90625, "rewards/rejected": 1.9375, "rewards/accuracies": 1.0, "rewards/margins": 1.9609375, "logps/rejected": -928.0, "logps/chosen": -462.0, "logits/rejected": 0.05078125, 
"logits/chosen": -0.061279296875, "nll_loss": 0.9140625, "epoch": 0.3, "step": 15}, {"loss": 1.189111328125, "grad_norm": 1.470369288221021, "learning_rate": 9.97849063861667e-05, "memory(GiB)": 61.22, "train_speed(iter/s)": 0.062635, "rewards/chosen": 7.03125, "rewards/rejected": 4.59375, "rewards/accuracies": 0.800000011920929, "rewards/margins": 2.4375, "logps/rejected": -290.0, "logps/chosen": -300.0, "logits/rejected": -0.453125, "logits/chosen": -0.51953125, "nll_loss": 0.482421875, "epoch": 0.4, "step": 20}, {"eval_loss": 0.490234375, "eval_runtime": 7.0921, "eval_samples_per_second": 0.564, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": 5.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.9375, "eval_logps/rejected": -178.0, "eval_logps/chosen": -7.125, "eval_logits/rejected": -0.462890625, "eval_logits/chosen": -0.8046875, "eval_nll_loss": 0.33984375, "epoch": 0.4, "step": 20}, {"loss": 0.6099853515625, "grad_norm": 0.7360842799133988, "learning_rate": 9.936876709681668e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.061887, "rewards/chosen": 11.125, "rewards/rejected": 3.078125, "rewards/accuracies": 1.0, "rewards/margins": 8.0625, "logps/rejected": -372.0, "logps/chosen": -254.0, "logits/rejected": -0.376953125, "logits/chosen": -0.380859375, "nll_loss": 0.4453125, "epoch": 0.5, "step": 25}, {"loss": 0.522998046875, "grad_norm": 0.6031711214032629, "learning_rate": 9.873583924954152e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.063962, "rewards/chosen": 10.8125, "rewards/rejected": 1.234375, "rewards/accuracies": 1.0, "rewards/margins": 9.5625, "logps/rejected": -584.0, "logps/chosen": -314.0, "logits/rejected": -0.7109375, "logits/chosen": -0.78515625, "nll_loss": 0.6328125, "epoch": 0.6, "step": 30}, {"loss": 0.52447509765625, "grad_norm": 1.192561076964146, "learning_rate": 9.788890216258939e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.065833, "rewards/chosen": 15.125, 
"rewards/rejected": 3.953125, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -201.0, "logps/chosen": -420.0, "logits/rejected": -0.58203125, "logits/chosen": -0.263671875, "nll_loss": 0.68359375, "epoch": 0.7, "step": 35}, {"loss": 0.461572265625, "grad_norm": 0.7003125541995815, "learning_rate": 9.68316749134364e-05, "memory(GiB)": 65.14, "train_speed(iter/s)": 0.064604, "rewards/chosen": 12.625, "rewards/rejected": 2.484375, "rewards/accuracies": 1.0, "rewards/margins": 10.1875, "logps/rejected": -342.0, "logps/chosen": -174.0, "logits/rejected": -0.51171875, "logits/chosen": -0.6875, "nll_loss": 0.52734375, "epoch": 0.8, "step": 40}, {"eval_loss": 0.44287109375, "eval_runtime": 7.0822, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": 0.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.125, "eval_logps/rejected": -230.0, "eval_logps/chosen": -6.5625, "eval_logits/rejected": -0.326171875, "eval_logits/chosen": -0.94921875, "eval_nll_loss": 0.3125, "epoch": 0.8, "step": 40}, {"loss": 0.45257568359375, "grad_norm": 0.6240116852703075, "learning_rate": 9.55688000075414e-05, "memory(GiB)": 72.04, "train_speed(iter/s)": 0.063816, "rewards/chosen": 18.25, "rewards/rejected": 3.203125, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -416.0, "logps/chosen": -378.0, "logits/rejected": -0.73828125, "logits/chosen": -0.51171875, "nll_loss": 0.4375, "epoch": 0.9, "step": 45}, {"loss": 0.44512939453125, "grad_norm": 0.9089784106511326, "learning_rate": 9.410582299213573e-05, "memory(GiB)": 72.04, "train_speed(iter/s)": 0.064594, "rewards/chosen": 13.0625, "rewards/rejected": 1.1171875, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/rejected": -1000.0, "logps/chosen": -386.0, "logits/rejected": -0.01324462890625, "logits/chosen": -0.2177734375, "nll_loss": 0.390625, "epoch": 1.0, "step": 50}, {"loss": 0.4583740234375, "grad_norm": 
0.5342183245129553, "learning_rate": 9.244916810456821e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064662, "rewards/chosen": 19.0, "rewards/rejected": 1.5078125, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -604.0, "logps/chosen": -314.0, "logits/rejected": -0.4296875, "logits/chosen": -0.08154296875, "nll_loss": 0.4765625, "epoch": 1.1, "step": 55}, {"loss": 0.415283203125, "grad_norm": 0.3307989428519516, "learning_rate": 9.060611006213832e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065074, "rewards/chosen": 13.8125, "rewards/rejected": -1.1875, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -540.0, "logps/chosen": -177.0, "logits/rejected": -0.2734375, "logits/chosen": -0.275390625, "nll_loss": 0.32421875, "epoch": 1.2, "step": 60}, {"eval_loss": 0.421875, "eval_runtime": 7.0588, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.75, "eval_rewards/rejected": -3.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.25, "eval_logps/rejected": -270.0, "eval_logps/chosen": -5.5, "eval_logits/rejected": -0.138671875, "eval_logits/chosen": -0.8828125, "eval_nll_loss": 0.26171875, "epoch": 1.2, "step": 60}, {"loss": 0.46671142578125, "grad_norm": 0.6235236115981455, "learning_rate": 8.858474211729469e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064345, "rewards/chosen": 13.6875, "rewards/rejected": -1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -708.0, "logps/chosen": -250.0, "logits/rejected": 0.048583984375, "logits/chosen": -0.2578125, "nll_loss": 0.470703125, "epoch": 1.3, "step": 65}, {"loss": 0.3884857177734375, "grad_norm": 0.4510421974711146, "learning_rate": 8.639394051847472e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065067, "rewards/chosen": 11.25, "rewards/rejected": -0.7578125, "rewards/accuracies": 1.0, "rewards/margins": 12.0, "logps/rejected": -876.0, "logps/chosen": -48.0, "logits/rejected": 
-0.09765625, "logits/chosen": -0.765625, "nll_loss": 0.50390625, "epoch": 1.4, "step": 70}, {"loss": 0.4062408447265625, "grad_norm": 0.46094113890076227, "learning_rate": 8.404332553264547e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065208, "rewards/chosen": 17.75, "rewards/rejected": 0.51953125, "rewards/accuracies": 1.0, "rewards/margins": 17.25, "logps/rejected": -376.0, "logps/chosen": -344.0, "logits/rejected": -0.388671875, "logits/chosen": -0.34375, "nll_loss": 0.353515625, "epoch": 1.5, "step": 75}, {"loss": 0.427960205078125, "grad_norm": 0.3027474591177724, "learning_rate": 8.154321920070414e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065989, "rewards/chosen": 22.125, "rewards/rejected": 4.75, "rewards/accuracies": 1.0, "rewards/margins": 17.5, "logps/rejected": -107.0, "logps/chosen": -426.0, "logits/rejected": -0.458984375, "logits/chosen": 0.28515625, "nll_loss": 0.5078125, "epoch": 1.6, "step": 80}, {"eval_loss": 0.4228515625, "eval_runtime": 7.0664, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.6875, "eval_rewards/rejected": -3.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.75, "eval_logps/rejected": -266.0, "eval_logps/chosen": -5.96875, "eval_logits/rejected": -0.11328125, "eval_logits/chosen": -0.859375, "eval_nll_loss": 0.28515625, "epoch": 1.6, "step": 80}, {"loss": 0.52896728515625, "grad_norm": 0.606815621196192, "learning_rate": 7.890460001124242e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065427, "rewards/chosen": 14.5625, "rewards/rejected": 0.48828125, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/rejected": -644.0, "logps/chosen": -217.0, "logits/rejected": -0.314453125, "logits/chosen": -0.365234375, "nll_loss": 0.3125, "epoch": 1.7, "step": 85}, {"loss": 0.4052978515625, "grad_norm": 0.603209904087939, "learning_rate": 7.613905469171246e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065028, "rewards/chosen": 14.9375, 
"rewards/rejected": 0.76953125, "rewards/accuracies": 1.0, "rewards/margins": 14.1875, "logps/rejected": -446.0, "logps/chosen": -236.0, "logits/rejected": -0.41796875, "logits/chosen": -0.2021484375, "nll_loss": 0.45703125, "epoch": 1.8, "step": 90}, {"loss": 0.420556640625, "grad_norm": 0.39872701490711776, "learning_rate": 7.325872732868869e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064964, "rewards/chosen": 17.75, "rewards/rejected": 0.453125, "rewards/accuracies": 1.0, "rewards/margins": 17.25, "logps/rejected": -472.0, "logps/chosen": -286.0, "logits/rejected": -0.09130859375, "logits/chosen": -0.265625, "nll_loss": 0.376953125, "epoch": 1.9, "step": 95}, {"loss": 0.3860954284667969, "grad_norm": 0.4636763750262995, "learning_rate": 7.027626604064969e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064753, "rewards/chosen": 14.4375, "rewards/rejected": 1.0078125, "rewards/accuracies": 1.0, "rewards/margins": 13.4375, "logps/rejected": -324.0, "logps/chosen": -214.0, "logits/rejected": -0.2099609375, "logits/chosen": -0.376953125, "nll_loss": 0.3515625, "epoch": 2.0, "step": 100}, {"eval_loss": 0.412109375, "eval_runtime": 6.9494, "eval_samples_per_second": 0.576, "eval_steps_per_second": 0.144, "eval_rewards/chosen": 9.8125, "eval_rewards/rejected": -3.296875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.125, "eval_logps/rejected": -268.0, "eval_logps/chosen": -5.125, "eval_logits/rejected": -0.189453125, "eval_logits/chosen": -1.0078125, "eval_nll_loss": 0.244140625, "epoch": 2.0, "step": 100}, {"loss": 0.370050048828125, "grad_norm": 0.3992315577108378, "learning_rate": 6.720476743745072e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064078, "rewards/chosen": 20.875, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 16.5, "logps/rejected": -544.0, "logps/chosen": -336.0, "logits/rejected": -0.62890625, "logits/chosen": -0.36328125, "nll_loss": 0.439453125, "epoch": 2.1, "step": 105}, {"loss": 
0.3550140380859375, "grad_norm": 0.5929485058503616, "learning_rate": 6.405771911037699e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064054, "rewards/chosen": 17.375, "rewards/rejected": 1.5234375, "rewards/accuracies": 1.0, "rewards/margins": 15.8125, "logps/rejected": -864.0, "logps/chosen": -304.0, "logits/rejected": 0.003997802734375, "logits/chosen": -0.2275390625, "nll_loss": 0.328125, "epoch": 2.2, "step": 110}, {"loss": 0.38344573974609375, "grad_norm": 0.3698356285503532, "learning_rate": 6.08489404053159e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064216, "rewards/chosen": 19.25, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 15.0, "logps/rejected": -584.0, "logps/chosen": -356.0, "logits/rejected": -0.2099609375, "logits/chosen": -0.142578125, "nll_loss": 0.36328125, "epoch": 2.3, "step": 115}, {"loss": 0.31541900634765624, "grad_norm": 0.5467922855501066, "learning_rate": 5.7592521739125726e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064713, "rewards/chosen": 17.5, "rewards/rejected": 1.3515625, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -300.0, "logps/chosen": -282.0, "logits/rejected": -0.470703125, "logits/chosen": -0.427734375, "nll_loss": 0.271484375, "epoch": 2.4, "step": 120}, {"eval_loss": 0.401611328125, "eval_runtime": 7.0901, "eval_samples_per_second": 0.564, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.875, "eval_rewards/rejected": -3.90625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.75, "eval_logps/rejected": -274.0, "eval_logps/chosen": -4.4375, "eval_logits/rejected": -0.1611328125, "eval_logits/chosen": -1.015625, "eval_nll_loss": 0.2119140625, "epoch": 2.4, "step": 120}, {"loss": 0.3151878356933594, "grad_norm": 0.8829625223112105, "learning_rate": 5.430276272567485e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064742, "rewards/chosen": 13.4375, "rewards/rejected": -0.353515625, "rewards/accuracies": 1.0, "rewards/margins": 
13.8125, "logps/rejected": -920.0, "logps/chosen": -17.375, "logits/rejected": -0.287109375, "logits/chosen": -0.96875, "nll_loss": 0.2060546875, "epoch": 2.5, "step": 125}, {"loss": 0.30629501342773435, "grad_norm": 2.121898818349653, "learning_rate": 5.0994109383253506e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065152, "rewards/chosen": 20.0, "rewards/rejected": 0.208984375, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -255.0, "logps/chosen": -302.0, "logits/rejected": -0.58984375, "logits/chosen": -0.478515625, "nll_loss": 0.298828125, "epoch": 2.6, "step": 130}, {"loss": 0.2922694206237793, "grad_norm": 0.9713606046745735, "learning_rate": 4.768109069909307e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065246, "rewards/chosen": 13.125, "rewards/rejected": -2.203125, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -560.0, "logps/chosen": -118.0, "logits/rejected": -0.224609375, "logits/chosen": -0.57421875, "nll_loss": 0.111328125, "epoch": 2.7, "step": 135}, {"loss": 0.3446833610534668, "grad_norm": 0.42207060142345165, "learning_rate": 4.4378254829551396e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.065537, "rewards/chosen": 18.875, "rewards/rejected": 0.06982421875, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/rejected": -604.0, "logps/chosen": -274.0, "logits/rejected": -0.49609375, "logits/chosen": -0.45703125, "nll_loss": 0.490234375, "epoch": 2.8, "step": 140}, {"eval_loss": 0.42333984375, "eval_runtime": 7.0582, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": -4.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.125, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.28125, "eval_logits/rejected": -0.1552734375, "eval_logits/chosen": -1.03125, "eval_nll_loss": 0.298828125, "epoch": 2.8, "step": 140}, {"loss": 0.3518634796142578, "grad_norm": 1.6580702623605046, "learning_rate": 
4.11001052161225e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06474, "rewards/chosen": 20.25, "rewards/rejected": 1.515625, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/rejected": -572.0, "logps/chosen": -440.0, "logits/rejected": -0.359375, "logits/chosen": -0.12255859375, "nll_loss": 0.46875, "epoch": 2.9, "step": 145}, {"loss": 0.2712591886520386, "grad_norm": 0.4467300458184829, "learning_rate": 3.786103689779861e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064444, "rewards/chosen": 23.25, "rewards/rejected": 2.125, "rewards/accuracies": 1.0, "rewards/margins": 21.125, "logps/rejected": -248.0, "logps/chosen": -318.0, "logits/rejected": -0.400390625, "logits/chosen": 0.0299072265625, "nll_loss": 0.29296875, "epoch": 3.0, "step": 150}, {"loss": 0.31831893920898435, "grad_norm": 0.3431419537396095, "learning_rate": 3.467527329945026e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064601, "rewards/chosen": 21.875, "rewards/rejected": 0.024658203125, "rewards/accuracies": 1.0, "rewards/margins": 21.875, "logps/rejected": -536.0, "logps/chosen": -436.0, "logits/rejected": -0.443359375, "logits/chosen": -0.373046875, "nll_loss": 0.275390625, "epoch": 3.1, "step": 155}, {"loss": 0.2718070030212402, "grad_norm": 0.32759751569486095, "learning_rate": 3.1556803773799614e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064486, "rewards/chosen": 23.5, "rewards/rejected": 3.484375, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -346.0, "logps/chosen": -324.0, "logits/rejected": -0.70703125, "logits/chosen": -0.39453125, "nll_loss": 0.3046875, "epoch": 3.2, "step": 160}, {"eval_loss": 0.4287109375, "eval_runtime": 7.0148, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.143, "eval_rewards/chosen": 9.625, "eval_rewards/rejected": -4.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.3125, "eval_logps/rejected": -282.0, "eval_logps/chosen": -6.625, "eval_logits/rejected": -0.1953125, 
"eval_logits/chosen": -1.0, "eval_nll_loss": 0.31640625, "epoch": 3.2, "step": 160}, {"loss": 0.2604236602783203, "grad_norm": 0.6366379388596981, "learning_rate": 2.8519322171253602e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064349, "rewards/chosen": 14.875, "rewards/rejected": -4.53125, "rewards/accuracies": 1.0, "rewards/margins": 19.375, "logps/rejected": -456.0, "logps/chosen": -118.5, "logits/rejected": -0.21875, "logits/chosen": -0.6796875, "nll_loss": 0.10009765625, "epoch": 3.3, "step": 165}, {"loss": 0.22490353584289552, "grad_norm": 0.4478398792618674, "learning_rate": 2.5576166707349385e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064149, "rewards/chosen": 27.25, "rewards/rejected": 1.8203125, "rewards/accuracies": 1.0, "rewards/margins": 25.5, "logps/rejected": -169.0, "logps/chosen": -458.0, "logits/rejected": -0.62109375, "logits/chosen": 0.22265625, "nll_loss": 0.30859375, "epoch": 3.4, "step": 170}, {"loss": 0.24174847602844238, "grad_norm": 0.43618775126579723, "learning_rate": 2.2740261391866637e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064217, "rewards/chosen": 25.625, "rewards/rejected": 4.9375, "rewards/accuracies": 1.0, "rewards/margins": 20.625, "logps/rejected": -452.0, "logps/chosen": -254.0, "logits/rejected": 0.06494140625, "logits/chosen": 0.5625, "nll_loss": 0.337890625, "epoch": 3.5, "step": 175}, {"loss": 0.30023603439331054, "grad_norm": 0.328809465093868, "learning_rate": 2.002405927680374e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064542, "rewards/chosen": 26.25, "rewards/rejected": 0.8359375, "rewards/accuracies": 1.0, "rewards/margins": 25.375, "logps/rejected": -251.0, "logps/chosen": -388.0, "logits/rejected": -0.640625, "logits/chosen": -0.251953125, "nll_loss": 0.43359375, "epoch": 3.6, "step": 180}, {"eval_loss": 0.4462890625, "eval_runtime": 7.0704, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -4.90625, 
"eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.375, "eval_logps/rejected": -284.0, "eval_logps/chosen": -8.125, "eval_logits/rejected": -0.1953125, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.38671875, "epoch": 3.6, "step": 180}, {"loss": 0.25345821380615235, "grad_norm": 0.49681405036162596, "learning_rate": 1.743948777242814e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064516, "rewards/chosen": 25.0, "rewards/rejected": 2.671875, "rewards/accuracies": 1.0, "rewards/margins": 22.375, "logps/rejected": -195.0, "logps/chosen": -358.0, "logits/rejected": -0.45703125, "logits/chosen": -0.09375, "nll_loss": 0.412109375, "epoch": 3.7, "step": 185}, {"loss": 0.2541311264038086, "grad_norm": 0.5063889084098974, "learning_rate": 1.4997896271528739e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064791, "rewards/chosen": 24.875, "rewards/rejected": -0.404296875, "rewards/accuracies": 1.0, "rewards/margins": 25.25, "logps/rejected": -544.0, "logps/chosen": -183.0, "logits/rejected": -0.2734375, "logits/chosen": -0.60546875, "nll_loss": 0.25, "epoch": 3.8, "step": 190}, {"loss": 0.21763362884521484, "grad_norm": 0.5269282280846967, "learning_rate": 1.2710006311864104e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06463, "rewards/chosen": 22.5, "rewards/rejected": 0.051513671875, "rewards/accuracies": 1.0, "rewards/margins": 22.5, "logps/rejected": -556.0, "logps/chosen": -324.0, "logits/rejected": -0.10546875, "logits/chosen": 0.435546875, "nll_loss": 0.26171875, "epoch": 3.9, "step": 195}, {"loss": 0.24068713188171387, "grad_norm": 0.30158893716839064, "learning_rate": 1.0585864495652897e-05, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064458, "rewards/chosen": 21.875, "rewards/rejected": 0.296875, "rewards/accuracies": 1.0, "rewards/margins": 21.625, "logps/rejected": -568.0, "logps/chosen": -235.0, "logits/rejected": -0.056884765625, "logits/chosen": -0.0654296875, "nll_loss": 0.25390625, "epoch": 4.0, "step": 200}, {"eval_loss": 
0.4453125, "eval_runtime": 7.0218, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -4.6875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.1875, "eval_logps/rejected": -282.0, "eval_logps/chosen": -7.90625, "eval_logits/rejected": -0.19921875, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.376953125, "epoch": 4.0, "step": 200}, {"loss": 0.2583838939666748, "grad_norm": 0.5197111532124594, "learning_rate": 8.634798372847148e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064077, "rewards/chosen": 23.125, "rewards/rejected": 4.0625, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -556.0, "logps/chosen": -270.0, "logits/rejected": -0.1318359375, "logits/chosen": -0.17578125, "nll_loss": 0.23828125, "epoch": 4.1, "step": 205}, {"loss": 0.2005645751953125, "grad_norm": 0.6236218136792746, "learning_rate": 6.865375481914016e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064191, "rewards/chosen": 25.75, "rewards/rejected": 1.8515625, "rewards/accuracies": 1.0, "rewards/margins": 24.0, "logps/rejected": -220.0, "logps/chosen": -184.0, "logits/rejected": -0.48828125, "logits/chosen": -0.2294921875, "nll_loss": 0.208984375, "epoch": 4.2, "step": 210}, {"loss": 0.23559434413909913, "grad_norm": 0.47824031212563534, "learning_rate": 5.285365727986707e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064213, "rewards/chosen": 18.75, "rewards/rejected": -2.890625, "rewards/accuracies": 1.0, "rewards/margins": 21.625, "logps/rejected": -972.0, "logps/chosen": -117.0, "logits/rejected": 0.1650390625, "logits/chosen": -0.494140625, "nll_loss": 0.1142578125, "epoch": 4.3, "step": 215}, {"loss": 0.21706581115722656, "grad_norm": 0.3885497874914557, "learning_rate": 3.901707263589671e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064109, "rewards/chosen": 15.75, "rewards/rejected": -3.921875, "rewards/accuracies": 1.0, "rewards/margins": 19.625, 
"logps/rejected": -864.0, "logps/chosen": -70.5, "logits/rejected": -0.08203125, "logits/chosen": -0.5625, "nll_loss": 0.052978515625, "epoch": 4.4, "step": 220}, {"eval_loss": 0.44921875, "eval_runtime": 7.0379, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "eval_rewards/chosen": 9.5, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.625, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.25, "eval_logits/rejected": -0.20703125, "eval_logits/chosen": -0.9765625, "eval_nll_loss": 0.392578125, "epoch": 4.4, "step": 220}, {"loss": 0.23574182987213135, "grad_norm": 0.38319446798926815, "learning_rate": 2.7204760217631074e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.063935, "rewards/chosen": 32.25, "rewards/rejected": 5.15625, "rewards/accuracies": 1.0, "rewards/margins": 27.125, "logps/rejected": -201.0, "logps/chosen": -470.0, "logits/rejected": -0.625, "logits/chosen": -0.12890625, "nll_loss": 0.486328125, "epoch": 4.5, "step": 225}, {"loss": 0.2071385383605957, "grad_norm": 0.4331863840312884, "learning_rate": 1.7468590353731495e-06, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.063909, "rewards/chosen": 25.0, "rewards/rejected": 1.828125, "rewards/accuracies": 1.0, "rewards/margins": 23.125, "logps/rejected": -676.0, "logps/chosen": -208.0, "logits/rejected": -0.5234375, "logits/chosen": 0.028076171875, "nll_loss": 0.26171875, "epoch": 4.6, "step": 230}, {"loss": 0.1720048427581787, "grad_norm": 0.7273350214373521, "learning_rate": 9.851316597681958e-07, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06399, "rewards/chosen": 18.75, "rewards/rejected": -2.609375, "rewards/accuracies": 1.0, "rewards/margins": 21.375, "logps/rejected": -624.0, "logps/chosen": -151.0, "logits/rejected": -0.078125, "logits/chosen": -0.58984375, "nll_loss": 0.1806640625, "epoch": 4.7, "step": 235}, {"loss": 0.20636966228485107, "grad_norm": 0.32144115051135386, "learning_rate": 4.386387988014273e-07, 
"memory(GiB)": 77.37, "train_speed(iter/s)": 0.064197, "rewards/chosen": 28.75, "rewards/rejected": 4.34375, "rewards/accuracies": 1.0, "rewards/margins": 24.375, "logps/rejected": -556.0, "logps/chosen": -304.0, "logits/rejected": -0.5234375, "logits/chosen": -0.384765625, "nll_loss": 0.439453125, "epoch": 4.8, "step": 240}, {"eval_loss": 0.4521484375, "eval_runtime": 7.0748, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.141, "eval_rewards/chosen": 9.4375, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.4375, "eval_logits/rejected": -0.20703125, "eval_logits/chosen": -0.98046875, "eval_nll_loss": 0.40234375, "epoch": 4.8, "step": 240}, {"loss": 0.25318150520324706, "grad_norm": 0.4466988097200344, "learning_rate": 1.0978021666005478e-07, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.06414, "rewards/chosen": 25.75, "rewards/rejected": 2.328125, "rewards/accuracies": 1.0, "rewards/margins": 23.5, "logps/rejected": -478.0, "logps/chosen": -410.0, "logits/rejected": -0.5390625, "logits/chosen": -0.220703125, "nll_loss": 0.41015625, "epoch": 4.9, "step": 245}, {"loss": 0.2871107816696167, "grad_norm": 0.3187395254363038, "learning_rate": 0.0, "memory(GiB)": 77.37, "train_speed(iter/s)": 0.064317, "rewards/chosen": 24.125, "rewards/rejected": 0.26171875, "rewards/accuracies": 1.0, "rewards/margins": 23.875, "logps/rejected": -420.0, "logps/chosen": -270.0, "logits/rejected": -0.58984375, "logits/chosen": -0.4296875, "nll_loss": 0.298828125, "epoch": 5.0, "step": 250}, {"eval_loss": 0.45166015625, "eval_runtime": 7.0019, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.143, "eval_rewards/chosen": 9.4375, "eval_rewards/rejected": -5.09375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -286.0, "eval_logps/chosen": -8.4375, "eval_logits/rejected": -0.2080078125, "eval_logits/chosen": -0.9765625, 
"eval_nll_loss": 0.40234375, "epoch": 5.0, "step": 250}, {"train_runtime": 3893.34, "train_samples_per_second": 0.507, "train_steps_per_second": 0.064, "total_flos": 1365779831848960.0, "train_loss": 0.44612468576431275, "epoch": 5.0, "step": 250}], "memory": 77.373046875} diff --git a/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs/events.out.tfevents.1737748517.kml-task-547024-record-9965643-prod-worker-0.90170.0 b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs/events.out.tfevents.1737748517.kml-task-547024-record-9965643-prod-worker-0.90170.0 new file mode 100644 index 0000000000000000000000000000000000000000..398aea5fad3ef0a7188f8a176929137257e990df --- /dev/null +++ b/output_deepseek_dpo/deepseek-r1-70b_400_0.5_dpo_4200_rank8_epoch5_what/v0-20250124-195308/runs/events.out.tfevents.1737748517.kml-task-547024-record-9965643-prod-worker-0.90170.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce07066d20815a2f3aabdcf042219a5cb4249690ffa4b6abba52ed42b70ba92 +size 61852 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/args.json new file mode 100644 index 0000000000000000000000000000000000000000..d961b2a0eb097c33eaa1502875427a3d319d7477 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + 
"max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + 
"adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + 
"ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + 
"lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/README.md 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git 
a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..159a1f18e5c92845eafad7d0169a75c207bec517 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "k_proj", + "down_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5329d774de09e9140b7cd376f9ce6bc6f57c786f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:8fc5671bda305d7c6b4512289b16ab45e531056a5ffe22f631e56a0dbb26001f +size 73911112 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/additional_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/args.json new file mode 100644 index 0000000000000000000000000000000000000000..d961b2a0eb097c33eaa1502875427a3d319d7477 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + 
"split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + 
"log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + 
"hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + 
"vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, 
max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, 
dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/optimizer.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..493c932c7d2203a609ababf1d87fae6a940a2cc7 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0e1e5b4f1e633fc706db509c8e4e444e3fa90b6a33964c6c340a17f70e7d67c0 +size 148047722 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/rng_state.pth b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1925bc8434fb7fd302419563a1811602253857d7 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9cf51c563adfd3f6bdd2e17b028aa293de9ec215b0013e60f279e324d86220 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/scheduler.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5a7d8bc4abba06c30f7a7a85d5d4173984e4c9c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb96daf4c69d48b38d52e8ae266563af1b957cba209e2ff4307709a50fc6770 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/trainer_state.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f9a70c259ea8eb95d039651e3e14b565a304acc2 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/trainer_state.json @@ 
-0,0 +1,631 @@ +{ + "best_metric": 0.65625179, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-220", + "epoch": 4.808080808080808, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.3113684058189392, + "learning_rate": 7.692307692307694e-06, + "loss": 0.7323381900787354, + "memory(GiB)": 13.01, + "step": 1, + "token_acc": 0.8218189768255356, + "train_speed(iter/s)": 0.062844 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.971348226070404, + "learning_rate": 3.846153846153846e-05, + "loss": 1.1823183298110962, + "memory(GiB)": 22.91, + "step": 5, + "token_acc": 0.7330933259484907, + "train_speed(iter/s)": 0.166808 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.3070746958255768, + "learning_rate": 7.692307692307693e-05, + "loss": 0.9501662254333496, + "memory(GiB)": 29.36, + "step": 10, + "token_acc": 0.7653597716689199, + "train_speed(iter/s)": 0.214042 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.45415690541267395, + "learning_rate": 9.99816643111642e-05, + "loss": 0.8983875274658203, + "memory(GiB)": 35.88, + "step": 15, + "token_acc": 0.7695364238410596, + "train_speed(iter/s)": 0.236498 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.23825205862522125, + "learning_rate": 9.977554222133292e-05, + "loss": 0.7643344879150391, + "memory(GiB)": 35.89, + "step": 20, + "token_acc": 0.7882758359525883, + "train_speed(iter/s)": 0.25003 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.9797316789627075, + "eval_runtime": 0.6501, + "eval_samples_per_second": 6.153, + "eval_steps_per_second": 6.153, + "eval_token_acc": 0.6692913385826772, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1587868332862854, + "learning_rate": 
9.934132612707632e-05, + "loss": 0.7561020851135254, + "memory(GiB)": 45.29, + "step": 25, + "token_acc": 0.7834674238838933, + "train_speed(iter/s)": 0.254419 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.1786944419145584, + "learning_rate": 9.868100580255466e-05, + "loss": 0.8542494773864746, + "memory(GiB)": 45.29, + "step": 30, + "token_acc": 0.753417087278488, + "train_speed(iter/s)": 0.260953 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.19432976841926575, + "learning_rate": 9.779760713358059e-05, + "loss": 0.7791708469390869, + "memory(GiB)": 45.29, + "step": 35, + "token_acc": 0.7836566725455615, + "train_speed(iter/s)": 0.266293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.19285419583320618, + "learning_rate": 9.669517825164434e-05, + "loss": 0.6533683776855469, + "memory(GiB)": 45.29, + "step": 40, + "token_acc": 0.8123940031205481, + "train_speed(iter/s)": 0.26967 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.7691243290901184, + "eval_runtime": 0.6812, + "eval_samples_per_second": 5.872, + "eval_steps_per_second": 5.872, + "eval_token_acc": 0.6860236220472441, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.16239717602729797, + "learning_rate": 9.537877098354786e-05, + "loss": 0.6813922882080078, + "memory(GiB)": 45.29, + "step": 45, + "token_acc": 0.793583974452025, + "train_speed(iter/s)": 0.270446 + }, + { + "epoch": 1.0, + "grad_norm": 0.13025595247745514, + "learning_rate": 9.385441770165385e-05, + "loss": 0.8526185989379883, + "memory(GiB)": 45.29, + "step": 50, + "token_acc": 0.7657785542694912, + "train_speed(iter/s)": 0.275541 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.17670190334320068, + "learning_rate": 9.212910368083245e-05, + "loss": 0.8316545486450195, + "memory(GiB)": 45.29, + "step": 55, + "token_acc": 0.7762060024132885, + "train_speed(iter/s)": 0.277057 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.17425084114074707, + "learning_rate": 
9.021073508877845e-05, + "loss": 0.7688841819763184, + "memory(GiB)": 45.29, + "step": 60, + "token_acc": 0.7774478686734241, + "train_speed(iter/s)": 0.278339 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.7357966899871826, + "eval_runtime": 0.7931, + "eval_samples_per_second": 5.044, + "eval_steps_per_second": 5.044, + "eval_token_acc": 0.6919291338582677, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.21490731835365295, + "learning_rate": 8.810810275638183e-05, + "loss": 0.8001401901245118, + "memory(GiB)": 45.29, + "step": 65, + "token_acc": 0.7726751710112775, + "train_speed(iter/s)": 0.277964 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15898309648036957, + "learning_rate": 8.583084189417224e-05, + "loss": 0.5408373832702636, + "memory(GiB)": 45.29, + "step": 70, + "token_acc": 0.8380025125628141, + "train_speed(iter/s)": 0.278722 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.15890304744243622, + "learning_rate": 8.338938793943478e-05, + "loss": 0.7484941005706787, + "memory(GiB)": 45.29, + "step": 75, + "token_acc": 0.796329302952936, + "train_speed(iter/s)": 0.280163 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.2820371389389038, + "learning_rate": 8.079492873632554e-05, + "loss": 0.76522216796875, + "memory(GiB)": 45.29, + "step": 80, + "token_acc": 0.773585676913015, + "train_speed(iter/s)": 0.280971 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.7128127813339233, + "eval_runtime": 0.7446, + "eval_samples_per_second": 5.372, + "eval_steps_per_second": 5.372, + "eval_token_acc": 0.6988188976377953, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.17685988545417786, + "learning_rate": 7.805935326811912e-05, + "loss": 0.67728590965271, + "memory(GiB)": 45.29, + "step": 85, + "token_acc": 0.8014132731113863, + "train_speed(iter/s)": 0.280315 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.20143380761146545, + "learning_rate": 7.519519717652039e-05, + "loss": 
0.6732349872589112, + "memory(GiB)": 45.29, + "step": 90, + "token_acc": 0.804056684575255, + "train_speed(iter/s)": 0.280753 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.14572596549987793, + "learning_rate": 7.221558531769519e-05, + "loss": 0.667899227142334, + "memory(GiB)": 45.29, + "step": 95, + "token_acc": 0.8067723818962669, + "train_speed(iter/s)": 0.28155 + }, + { + "epoch": 2.0, + "grad_norm": 0.30259954929351807, + "learning_rate": 6.91341716182545e-05, + "loss": 0.6598664283752441, + "memory(GiB)": 45.29, + "step": 100, + "token_acc": 0.8251867090022807, + "train_speed(iter/s)": 0.283616 + }, + { + "epoch": 2.0, + "eval_loss": 0.6895536184310913, + "eval_runtime": 0.5973, + "eval_samples_per_second": 6.697, + "eval_steps_per_second": 6.697, + "eval_token_acc": 0.6948818897637795, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.22523801028728485, + "learning_rate": 6.5965076506799e-05, + "loss": 0.5976018905639648, + "memory(GiB)": 45.29, + "step": 105, + "token_acc": 0.8109289125146409, + "train_speed(iter/s)": 0.283455 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.17166492342948914, + "learning_rate": 6.272282220774091e-05, + "loss": 0.6538212299346924, + "memory(GiB)": 45.29, + "step": 110, + "token_acc": 0.803433422019511, + "train_speed(iter/s)": 0.2839 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.20610365271568298, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.6409964084625244, + "memory(GiB)": 45.29, + "step": 115, + "token_acc": 0.8130777268708304, + "train_speed(iter/s)": 0.284576 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.2860853672027588, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.6116076946258545, + "memory(GiB)": 45.29, + "step": 120, + "token_acc": 0.8325966178677157, + "train_speed(iter/s)": 0.285065 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.6733097434043884, + "eval_runtime": 0.6385, + "eval_samples_per_second": 6.264, + 
"eval_steps_per_second": 6.264, + "eval_token_acc": 0.6998031496062992, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.24673469364643097, + "learning_rate": 5.270694542927088e-05, + "loss": 0.598513412475586, + "memory(GiB)": 45.29, + "step": 125, + "token_acc": 0.8092562679963481, + "train_speed(iter/s)": 0.285334 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.21779406070709229, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.7576182365417481, + "memory(GiB)": 45.29, + "step": 130, + "token_acc": 0.7910772994877284, + "train_speed(iter/s)": 0.285892 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.233392596244812, + "learning_rate": 4.594206372362845e-05, + "loss": 0.5816587448120117, + "memory(GiB)": 45.29, + "step": 135, + "token_acc": 0.836008997627338, + "train_speed(iter/s)": 0.286263 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.21131905913352966, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.6801999568939209, + "memory(GiB)": 45.29, + "step": 140, + "token_acc": 0.7989492694138893, + "train_speed(iter/s)": 0.28699 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.674415647983551, + "eval_runtime": 0.6365, + "eval_samples_per_second": 6.284, + "eval_steps_per_second": 6.284, + "eval_token_acc": 0.6958661417322834, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.2413957566022873, + "learning_rate": 3.92514779894488e-05, + "loss": 0.7208431243896485, + "memory(GiB)": 45.29, + "step": 145, + "token_acc": 0.7931504515249617, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 3.0, + "grad_norm": 0.3715948164463043, + "learning_rate": 3.597244112544208e-05, + "loss": 0.7635963916778564, + "memory(GiB)": 45.29, + "step": 150, + "token_acc": 0.7863333044457925, + "train_speed(iter/s)": 0.288363 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.3262959420681, + "learning_rate": 3.275768486860149e-05, + "loss": 0.752556848526001, + "memory(GiB)": 45.29, + "step": 155, 
+ "token_acc": 0.7921971532921459, + "train_speed(iter/s)": 0.288494 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.18450701236724854, + "learning_rate": 2.962194068331996e-05, + "loss": 0.608460807800293, + "memory(GiB)": 45.29, + "step": 160, + "token_acc": 0.8247789822248602, + "train_speed(iter/s)": 0.289163 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.6623361110687256, + "eval_runtime": 0.5658, + "eval_samples_per_second": 7.069, + "eval_steps_per_second": 7.069, + "eval_token_acc": 0.6958661417322834, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.3013140857219696, + "learning_rate": 2.65795779650105e-05, + "loss": 0.562100601196289, + "memory(GiB)": 45.29, + "step": 165, + "token_acc": 0.8357333974686588, + "train_speed(iter/s)": 0.288738 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.27321088314056396, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.6480295658111572, + "memory(GiB)": 45.29, + "step": 170, + "token_acc": 0.8038663373977466, + "train_speed(iter/s)": 0.288892 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.3005470633506775, + "learning_rate": 2.08302710446253e-05, + "loss": 0.6261944770812988, + "memory(GiB)": 45.29, + "step": 175, + "token_acc": 0.8233992624874288, + "train_speed(iter/s)": 0.289319 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.3772144019603729, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.5553826808929443, + "memory(GiB)": 45.29, + "step": 180, + "token_acc": 0.8288827847075246, + "train_speed(iter/s)": 0.289656 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.6648226380348206, + "eval_runtime": 0.6198, + "eval_samples_per_second": 6.454, + "eval_steps_per_second": 6.454, + "eval_token_acc": 0.6938976377952756, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.2717554569244385, + "learning_rate": 1.561502705732883e-05, + "loss": 0.6049529552459717, + "memory(GiB)": 45.29, + "step": 185, + "token_acc": 
0.8187963431702451, + "train_speed(iter/s)": 0.289334 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.18251824378967285, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.6194498062133789, + "memory(GiB)": 45.29, + "step": 190, + "token_acc": 0.8140125301994185, + "train_speed(iter/s)": 0.289757 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.33976471424102783, + "learning_rate": 1.102933089792042e-05, + "loss": 0.666096544265747, + "memory(GiB)": 45.29, + "step": 195, + "token_acc": 0.8082351568879039, + "train_speed(iter/s)": 0.290079 + }, + { + "epoch": 4.0, + "grad_norm": 0.3477536737918854, + "learning_rate": 8.999294173332058e-06, + "loss": 0.5860945701599121, + "memory(GiB)": 45.29, + "step": 200, + "token_acc": 0.8207529927807731, + "train_speed(iter/s)": 0.291039 + }, + { + "epoch": 4.0, + "eval_loss": 0.6583613753318787, + "eval_runtime": 0.5726, + "eval_samples_per_second": 6.986, + "eval_steps_per_second": 6.986, + "eval_token_acc": 0.6909448818897638, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.44803500175476074, + "learning_rate": 7.157141191620548e-06, + "loss": 0.5250832557678222, + "memory(GiB)": 45.29, + "step": 205, + "token_acc": 0.8369385000963948, + "train_speed(iter/s)": 0.290757 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.22857555747032166, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.7626357078552246, + "memory(GiB)": 45.29, + "step": 210, + "token_acc": 0.7796052631578947, + "train_speed(iter/s)": 0.290909 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.2053779661655426, + "learning_rate": 4.069353111818913e-06, + "loss": 0.6494219779968262, + "memory(GiB)": 45.29, + "step": 215, + "token_acc": 0.8163639347849874, + "train_speed(iter/s)": 0.291173 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.3344842791557312, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.649469518661499, + "memory(GiB)": 45.29, + "step": 220, + "token_acc": 0.8097098023092473, 
+ "train_speed(iter/s)": 0.291297 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.6562517881393433, + "eval_runtime": 0.5649, + "eval_samples_per_second": 7.081, + "eval_steps_per_second": 7.081, + "eval_token_acc": 0.6919291338582677, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.2932053506374359, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.5700400829315185, + "memory(GiB)": 45.29, + "step": 225, + "token_acc": 0.826302729528536, + "train_speed(iter/s)": 0.291192 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.29753655195236206, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.5672303676605225, + "memory(GiB)": 45.29, + "step": 230, + "token_acc": 0.8347479998375502, + "train_speed(iter/s)": 0.291184 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.2351677417755127, + "learning_rate": 4.577201710596612e-07, + "loss": 0.5433839797973633, + "memory(GiB)": 45.29, + "step": 235, + "token_acc": 0.8384622326504828, + "train_speed(iter/s)": 0.291437 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.2580409646034241, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.5680582523345947, + "memory(GiB)": 45.29, + "step": 240, + "token_acc": 0.829964260185847, + "train_speed(iter/s)": 0.291576 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.6566542387008667, + "eval_runtime": 0.5647, + "eval_samples_per_second": 7.084, + "eval_steps_per_second": 7.084, + "eval_token_acc": 0.6899606299212598, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.405736211013632e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/training_args.bin b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c7d6f1d9bffa041371678b1cf39933953aabd78 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f597a6ef45ff19f5839a81e570c1f418eac5afbc9704f40518398d7021c02f +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/README.md b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### 
Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..159a1f18e5c92845eafad7d0169a75c207bec517 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": 
"megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "k_proj", + "down_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ac497ae00882c7dbd5d614252b8a25ea885ec49 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfa7a1462d8e07a46b9399b76d012f241fc8b51b3e1ec108760236b3e40d7a3 +size 73911112 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/additional_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..d961b2a0eb097c33eaa1502875427a3d319d7477 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/optimizer.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a59c0aec6b7b2acd107a1fc2ce2c0b0c37b3155d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a34d77c0efe0fc91b99995decc8738119cac6a5a98dc018229481550e11c2bf9 +size 148047722 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/rng_state.pth b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab736d336e2dff530550c3e35454660c3605397d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4acfab0b232074de1c3ba342e3e8294c305708ad453d7aa55bc380b7c98a10b +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c07b610e877e513fda3813a64af716a38654c2f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ba022925b97a0c60fdb73ede217e52b3b55c5065f112ff19fea77b6a69dd5d +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/trainer_state.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7544ce15e9e0c0636c7882805f93c1dc624bdcc0 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/trainer_state.json @@ -0,0 +1,650 @@ +{ + "best_metric": 0.65595973, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245", + "epoch": 4.909090909090909, + "eval_steps": 20, + "global_step": 245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.3113684058189392, + "learning_rate": 7.692307692307694e-06, + "loss": 0.7323381900787354, + "memory(GiB)": 13.01, + "step": 1, + "token_acc": 0.8218189768255356, + "train_speed(iter/s)": 0.062844 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.971348226070404, + "learning_rate": 3.846153846153846e-05, + "loss": 1.1823183298110962, + "memory(GiB)": 22.91, + "step": 5, + "token_acc": 0.7330933259484907, + "train_speed(iter/s)": 0.166808 + }, + { + "epoch": 
0.20202020202020202, + "grad_norm": 0.3070746958255768, + "learning_rate": 7.692307692307693e-05, + "loss": 0.9501662254333496, + "memory(GiB)": 29.36, + "step": 10, + "token_acc": 0.7653597716689199, + "train_speed(iter/s)": 0.214042 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.45415690541267395, + "learning_rate": 9.99816643111642e-05, + "loss": 0.8983875274658203, + "memory(GiB)": 35.88, + "step": 15, + "token_acc": 0.7695364238410596, + "train_speed(iter/s)": 0.236498 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.23825205862522125, + "learning_rate": 9.977554222133292e-05, + "loss": 0.7643344879150391, + "memory(GiB)": 35.89, + "step": 20, + "token_acc": 0.7882758359525883, + "train_speed(iter/s)": 0.25003 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.9797316789627075, + "eval_runtime": 0.6501, + "eval_samples_per_second": 6.153, + "eval_steps_per_second": 6.153, + "eval_token_acc": 0.6692913385826772, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1587868332862854, + "learning_rate": 9.934132612707632e-05, + "loss": 0.7561020851135254, + "memory(GiB)": 45.29, + "step": 25, + "token_acc": 0.7834674238838933, + "train_speed(iter/s)": 0.254419 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.1786944419145584, + "learning_rate": 9.868100580255466e-05, + "loss": 0.8542494773864746, + "memory(GiB)": 45.29, + "step": 30, + "token_acc": 0.753417087278488, + "train_speed(iter/s)": 0.260953 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.19432976841926575, + "learning_rate": 9.779760713358059e-05, + "loss": 0.7791708469390869, + "memory(GiB)": 45.29, + "step": 35, + "token_acc": 0.7836566725455615, + "train_speed(iter/s)": 0.266293 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.19285419583320618, + "learning_rate": 9.669517825164434e-05, + "loss": 0.6533683776855469, + "memory(GiB)": 45.29, + "step": 40, + "token_acc": 0.8123940031205481, + "train_speed(iter/s)": 0.26967 + }, + { + 
"epoch": 0.8080808080808081, + "eval_loss": 0.7691243290901184, + "eval_runtime": 0.6812, + "eval_samples_per_second": 5.872, + "eval_steps_per_second": 5.872, + "eval_token_acc": 0.6860236220472441, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.16239717602729797, + "learning_rate": 9.537877098354786e-05, + "loss": 0.6813922882080078, + "memory(GiB)": 45.29, + "step": 45, + "token_acc": 0.793583974452025, + "train_speed(iter/s)": 0.270446 + }, + { + "epoch": 1.0, + "grad_norm": 0.13025595247745514, + "learning_rate": 9.385441770165385e-05, + "loss": 0.8526185989379883, + "memory(GiB)": 45.29, + "step": 50, + "token_acc": 0.7657785542694912, + "train_speed(iter/s)": 0.275541 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.17670190334320068, + "learning_rate": 9.212910368083245e-05, + "loss": 0.8316545486450195, + "memory(GiB)": 45.29, + "step": 55, + "token_acc": 0.7762060024132885, + "train_speed(iter/s)": 0.277057 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.17425084114074707, + "learning_rate": 9.021073508877845e-05, + "loss": 0.7688841819763184, + "memory(GiB)": 45.29, + "step": 60, + "token_acc": 0.7774478686734241, + "train_speed(iter/s)": 0.278339 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.7357966899871826, + "eval_runtime": 0.7931, + "eval_samples_per_second": 5.044, + "eval_steps_per_second": 5.044, + "eval_token_acc": 0.6919291338582677, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.21490731835365295, + "learning_rate": 8.810810275638183e-05, + "loss": 0.8001401901245118, + "memory(GiB)": 45.29, + "step": 65, + "token_acc": 0.7726751710112775, + "train_speed(iter/s)": 0.277964 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15898309648036957, + "learning_rate": 8.583084189417224e-05, + "loss": 0.5408373832702636, + "memory(GiB)": 45.29, + "step": 70, + "token_acc": 0.8380025125628141, + "train_speed(iter/s)": 0.278722 + }, + { + "epoch": 1.5050505050505052, + 
"grad_norm": 0.15890304744243622, + "learning_rate": 8.338938793943478e-05, + "loss": 0.7484941005706787, + "memory(GiB)": 45.29, + "step": 75, + "token_acc": 0.796329302952936, + "train_speed(iter/s)": 0.280163 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.2820371389389038, + "learning_rate": 8.079492873632554e-05, + "loss": 0.76522216796875, + "memory(GiB)": 45.29, + "step": 80, + "token_acc": 0.773585676913015, + "train_speed(iter/s)": 0.280971 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.7128127813339233, + "eval_runtime": 0.7446, + "eval_samples_per_second": 5.372, + "eval_steps_per_second": 5.372, + "eval_token_acc": 0.6988188976377953, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.17685988545417786, + "learning_rate": 7.805935326811912e-05, + "loss": 0.67728590965271, + "memory(GiB)": 45.29, + "step": 85, + "token_acc": 0.8014132731113863, + "train_speed(iter/s)": 0.280315 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.20143380761146545, + "learning_rate": 7.519519717652039e-05, + "loss": 0.6732349872589112, + "memory(GiB)": 45.29, + "step": 90, + "token_acc": 0.804056684575255, + "train_speed(iter/s)": 0.280753 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.14572596549987793, + "learning_rate": 7.221558531769519e-05, + "loss": 0.667899227142334, + "memory(GiB)": 45.29, + "step": 95, + "token_acc": 0.8067723818962669, + "train_speed(iter/s)": 0.28155 + }, + { + "epoch": 2.0, + "grad_norm": 0.30259954929351807, + "learning_rate": 6.91341716182545e-05, + "loss": 0.6598664283752441, + "memory(GiB)": 45.29, + "step": 100, + "token_acc": 0.8251867090022807, + "train_speed(iter/s)": 0.283616 + }, + { + "epoch": 2.0, + "eval_loss": 0.6895536184310913, + "eval_runtime": 0.5973, + "eval_samples_per_second": 6.697, + "eval_steps_per_second": 6.697, + "eval_token_acc": 0.6948818897637795, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.22523801028728485, + "learning_rate": 
6.5965076506799e-05, + "loss": 0.5976018905639648, + "memory(GiB)": 45.29, + "step": 105, + "token_acc": 0.8109289125146409, + "train_speed(iter/s)": 0.283455 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.17166492342948914, + "learning_rate": 6.272282220774091e-05, + "loss": 0.6538212299346924, + "memory(GiB)": 45.29, + "step": 110, + "token_acc": 0.803433422019511, + "train_speed(iter/s)": 0.2839 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.20610365271568298, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.6409964084625244, + "memory(GiB)": 45.29, + "step": 115, + "token_acc": 0.8130777268708304, + "train_speed(iter/s)": 0.284576 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.2860853672027588, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.6116076946258545, + "memory(GiB)": 45.29, + "step": 120, + "token_acc": 0.8325966178677157, + "train_speed(iter/s)": 0.285065 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.6733097434043884, + "eval_runtime": 0.6385, + "eval_samples_per_second": 6.264, + "eval_steps_per_second": 6.264, + "eval_token_acc": 0.6998031496062992, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.24673469364643097, + "learning_rate": 5.270694542927088e-05, + "loss": 0.598513412475586, + "memory(GiB)": 45.29, + "step": 125, + "token_acc": 0.8092562679963481, + "train_speed(iter/s)": 0.285334 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.21779406070709229, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.7576182365417481, + "memory(GiB)": 45.29, + "step": 130, + "token_acc": 0.7910772994877284, + "train_speed(iter/s)": 0.285892 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.233392596244812, + "learning_rate": 4.594206372362845e-05, + "loss": 0.5816587448120117, + "memory(GiB)": 45.29, + "step": 135, + "token_acc": 0.836008997627338, + "train_speed(iter/s)": 0.286263 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.21131905913352966, + 
"learning_rate": 4.2579769433468694e-05, + "loss": 0.6801999568939209, + "memory(GiB)": 45.29, + "step": 140, + "token_acc": 0.7989492694138893, + "train_speed(iter/s)": 0.28699 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.674415647983551, + "eval_runtime": 0.6365, + "eval_samples_per_second": 6.284, + "eval_steps_per_second": 6.284, + "eval_token_acc": 0.6958661417322834, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.2413957566022873, + "learning_rate": 3.92514779894488e-05, + "loss": 0.7208431243896485, + "memory(GiB)": 45.29, + "step": 145, + "token_acc": 0.7931504515249617, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 3.0, + "grad_norm": 0.3715948164463043, + "learning_rate": 3.597244112544208e-05, + "loss": 0.7635963916778564, + "memory(GiB)": 45.29, + "step": 150, + "token_acc": 0.7863333044457925, + "train_speed(iter/s)": 0.288363 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.3262959420681, + "learning_rate": 3.275768486860149e-05, + "loss": 0.752556848526001, + "memory(GiB)": 45.29, + "step": 155, + "token_acc": 0.7921971532921459, + "train_speed(iter/s)": 0.288494 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.18450701236724854, + "learning_rate": 2.962194068331996e-05, + "loss": 0.608460807800293, + "memory(GiB)": 45.29, + "step": 160, + "token_acc": 0.8247789822248602, + "train_speed(iter/s)": 0.289163 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.6623361110687256, + "eval_runtime": 0.5658, + "eval_samples_per_second": 7.069, + "eval_steps_per_second": 7.069, + "eval_token_acc": 0.6958661417322834, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.3013140857219696, + "learning_rate": 2.65795779650105e-05, + "loss": 0.562100601196289, + "memory(GiB)": 45.29, + "step": 165, + "token_acc": 0.8357333974686588, + "train_speed(iter/s)": 0.288738 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.27321088314056396, + "learning_rate": 2.3644538193049625e-05, + 
"loss": 0.6480295658111572, + "memory(GiB)": 45.29, + "step": 170, + "token_acc": 0.8038663373977466, + "train_speed(iter/s)": 0.288892 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.3005470633506775, + "learning_rate": 2.08302710446253e-05, + "loss": 0.6261944770812988, + "memory(GiB)": 45.29, + "step": 175, + "token_acc": 0.8233992624874288, + "train_speed(iter/s)": 0.289319 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.3772144019603729, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.5553826808929443, + "memory(GiB)": 45.29, + "step": 180, + "token_acc": 0.8288827847075246, + "train_speed(iter/s)": 0.289656 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.6648226380348206, + "eval_runtime": 0.6198, + "eval_samples_per_second": 6.454, + "eval_steps_per_second": 6.454, + "eval_token_acc": 0.6938976377952756, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.2717554569244385, + "learning_rate": 1.561502705732883e-05, + "loss": 0.6049529552459717, + "memory(GiB)": 45.29, + "step": 185, + "token_acc": 0.8187963431702451, + "train_speed(iter/s)": 0.289334 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.18251824378967285, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.6194498062133789, + "memory(GiB)": 45.29, + "step": 190, + "token_acc": 0.8140125301994185, + "train_speed(iter/s)": 0.289757 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.33976471424102783, + "learning_rate": 1.102933089792042e-05, + "loss": 0.666096544265747, + "memory(GiB)": 45.29, + "step": 195, + "token_acc": 0.8082351568879039, + "train_speed(iter/s)": 0.290079 + }, + { + "epoch": 4.0, + "grad_norm": 0.3477536737918854, + "learning_rate": 8.999294173332058e-06, + "loss": 0.5860945701599121, + "memory(GiB)": 45.29, + "step": 200, + "token_acc": 0.8207529927807731, + "train_speed(iter/s)": 0.291039 + }, + { + "epoch": 4.0, + "eval_loss": 0.6583613753318787, + "eval_runtime": 0.5726, + "eval_samples_per_second": 6.986, + 
"eval_steps_per_second": 6.986, + "eval_token_acc": 0.6909448818897638, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.44803500175476074, + "learning_rate": 7.157141191620548e-06, + "loss": 0.5250832557678222, + "memory(GiB)": 45.29, + "step": 205, + "token_acc": 0.8369385000963948, + "train_speed(iter/s)": 0.290757 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.22857555747032166, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.7626357078552246, + "memory(GiB)": 45.29, + "step": 210, + "token_acc": 0.7796052631578947, + "train_speed(iter/s)": 0.290909 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.2053779661655426, + "learning_rate": 4.069353111818913e-06, + "loss": 0.6494219779968262, + "memory(GiB)": 45.29, + "step": 215, + "token_acc": 0.8163639347849874, + "train_speed(iter/s)": 0.291173 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.3344842791557312, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.649469518661499, + "memory(GiB)": 45.29, + "step": 220, + "token_acc": 0.8097098023092473, + "train_speed(iter/s)": 0.291297 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.6562517881393433, + "eval_runtime": 0.5649, + "eval_samples_per_second": 7.081, + "eval_steps_per_second": 7.081, + "eval_token_acc": 0.6919291338582677, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.2932053506374359, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.5700400829315185, + "memory(GiB)": 45.29, + "step": 225, + "token_acc": 0.826302729528536, + "train_speed(iter/s)": 0.291192 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.29753655195236206, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.5672303676605225, + "memory(GiB)": 45.29, + "step": 230, + "token_acc": 0.8347479998375502, + "train_speed(iter/s)": 0.291184 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.2351677417755127, + "learning_rate": 4.577201710596612e-07, + "loss": 0.5433839797973633, + 
"memory(GiB)": 45.29, + "step": 235, + "token_acc": 0.8384622326504828, + "train_speed(iter/s)": 0.291437 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.2580409646034241, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.5680582523345947, + "memory(GiB)": 45.29, + "step": 240, + "token_acc": 0.829964260185847, + "train_speed(iter/s)": 0.291576 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.6566542387008667, + "eval_runtime": 0.5647, + "eval_samples_per_second": 7.084, + "eval_steps_per_second": 7.084, + "eval_token_acc": 0.6899606299212598, + "step": 240 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.32335183024406433, + "learning_rate": 0.0, + "loss": 0.553482961654663, + "memory(GiB)": 45.29, + "step": 245, + "token_acc": 0.8262166079933734, + "train_speed(iter/s)": 0.291506 + }, + { + "epoch": 4.909090909090909, + "eval_loss": 0.6559597253799438, + "eval_runtime": 0.6383, + "eval_samples_per_second": 6.267, + "eval_steps_per_second": 6.267, + "eval_token_acc": 0.6948818897637795, + "step": 245 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.430410829455872e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/training_args.bin b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c7d6f1d9bffa041371678b1cf39933953aabd78 --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f597a6ef45ff19f5839a81e570c1f418eac5afbc9704f40518398d7021c02f +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1ac9173e52dce086cf3812eed8a46f4dad342931 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..9a9fbf80c509e38885219e1fc5b5da8d894cb8de Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..15511b8f78fd886808b6155ddd0fa7527969f6f0 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_samples_per_second.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..c119a0f156dd763abbbf8342e413b59c8ce65698 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..22a5f5cd5ff3b0d9ce986c9f5ced33b35eaf795c Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..9cc2f5018175ad195aab7254264c4c8fdcaa870b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..97786515b03cb1e6b8f8733ef1b17b3e8334bf6b Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..1a747bb27a6bf92800a1028c0eaaaa4305471d7f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..7c51ecb59409164707f9675cd7e5f75927bc3575 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..94cb9ad35c11fc0a67bee501bd2b766388dc31ac Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_token_acc.png new file mode 
100644 index 0000000000000000000000000000000000000000..7288c3c3272b5266ff2a87bf360aae98614fbecf Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..5c962205c8b8a5c9dc4ad7c1873c4b930614d32d Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6461ed83337d61a32dbb471b1511449dc586b846 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_runtime.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..00ab972300dcb6d464554b0dc64c1af7715fee6d Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_runtime.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..b7ff1ad38c9740096e4a6ee5c13ebdcb81affd52 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..1d7a52324be1573b60005e6fa890001c9a8dd6e1 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3cc25dd54f75bd82ad66e9fd9fe2a6008321f903 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/logging.jsonl b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/logging.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7e36f4331641b0ffe6e377bdbc5da82b2046bcbd --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/logging.jsonl @@ -0,0 +1,65 @@ +{"loss": 0.73233819, "token_acc": 0.82181898, "grad_norm": 0.31136841, "learning_rate": 7.69e-06, "memory(GiB)": 13.01, "train_speed(iter/s)": 0.062844, "epoch": 0.02020202, "global_step/max_steps": "1/245", "percentage": "0.41%", "elapsed_time": "15s", "remaining_time": "1h 3m 45s"} +{"loss": 1.18231833, "token_acc": 0.73309333, "grad_norm": 0.97134823, "learning_rate": 3.846e-05, "memory(GiB)": 22.91, "train_speed(iter/s)": 0.166808, "epoch": 0.1010101, "global_step/max_steps": "5/245", "percentage": "2.04%", "elapsed_time": "29s", "remaining_time": "23m 47s"} +{"loss": 0.95016623, "token_acc": 0.76535977, "grad_norm": 0.3070747, "learning_rate": 7.692e-05, "memory(GiB)": 29.36, "train_speed(iter/s)": 0.214042, "epoch": 0.2020202, "global_step/max_steps": "10/245", "percentage": "4.08%", "elapsed_time": "46s", "remaining_time": "18m 12s"} +{"loss": 0.89838753, "token_acc": 0.76953642, "grad_norm": 0.45415691, "learning_rate": 9.998e-05, "memory(GiB)": 35.88, "train_speed(iter/s)": 0.236498, "epoch": 0.3030303, "global_step/max_steps": "15/245", "percentage": "6.12%", "elapsed_time": "1m 3s", "remaining_time": "16m 8s"} +{"loss": 0.76433449, "token_acc": 0.78827584, "grad_norm": 0.23825206, "learning_rate": 9.978e-05, "memory(GiB)": 35.89, "train_speed(iter/s)": 0.25003, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "1m 19s", "remaining_time": "14m 57s"} +{"eval_loss": 0.97973168, "eval_token_acc": 0.66929134, "eval_runtime": 0.6501, "eval_samples_per_second": 6.153, "eval_steps_per_second": 6.153, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "1m 20s", "remaining_time": "15m 4s"} +{"loss": 0.75610209, "token_acc": 0.78346742, "grad_norm": 
0.15878683, "learning_rate": 9.934e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.254419, "epoch": 0.50505051, "global_step/max_steps": "25/245", "percentage": "10.20%", "elapsed_time": "1m 38s", "remaining_time": "14m 22s"} +{"loss": 0.85424948, "token_acc": 0.75341709, "grad_norm": 0.17869444, "learning_rate": 9.868e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.260953, "epoch": 0.60606061, "global_step/max_steps": "30/245", "percentage": "12.24%", "elapsed_time": "1m 54s", "remaining_time": "13m 42s"} +{"loss": 0.77917085, "token_acc": 0.78365667, "grad_norm": 0.19432977, "learning_rate": 9.78e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.266293, "epoch": 0.70707071, "global_step/max_steps": "35/245", "percentage": "14.29%", "elapsed_time": "2m 11s", "remaining_time": "13m 7s"} +{"loss": 0.65336838, "token_acc": 0.812394, "grad_norm": 0.1928542, "learning_rate": 9.67e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.26967, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "2m 28s", "remaining_time": "12m 39s"} +{"eval_loss": 0.76912433, "eval_token_acc": 0.68602362, "eval_runtime": 0.6812, "eval_samples_per_second": 5.872, "eval_steps_per_second": 5.872, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "2m 28s", "remaining_time": "12m 42s"} +{"loss": 0.68139229, "token_acc": 0.79358397, "grad_norm": 0.16239718, "learning_rate": 9.538e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.270446, "epoch": 0.90909091, "global_step/max_steps": "45/245", "percentage": "18.37%", "elapsed_time": "2m 46s", "remaining_time": "12m 18s"} +{"loss": 0.8526186, "token_acc": 0.76577855, "grad_norm": 0.13025595, "learning_rate": 9.385e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.275541, "epoch": 1.0, "global_step/max_steps": "50/245", "percentage": "20.41%", "elapsed_time": "3m 1s", "remaining_time": "11m 46s"} +{"loss": 0.83165455, "token_acc": 0.776206, 
"grad_norm": 0.1767019, "learning_rate": 9.213e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.277057, "epoch": 1.1010101, "global_step/max_steps": "55/245", "percentage": "22.45%", "elapsed_time": "3m 18s", "remaining_time": "11m 24s"} +{"loss": 0.76888418, "token_acc": 0.77744787, "grad_norm": 0.17425084, "learning_rate": 9.021e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.278339, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "3m 35s", "remaining_time": "11m 3s"} +{"eval_loss": 0.73579669, "eval_token_acc": 0.69192913, "eval_runtime": 0.7931, "eval_samples_per_second": 5.044, "eval_steps_per_second": 5.044, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "3m 36s", "remaining_time": "11m 6s"} +{"loss": 0.80014019, "token_acc": 0.77267517, "grad_norm": 0.21490732, "learning_rate": 8.811e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.277964, "epoch": 1.3030303, "global_step/max_steps": "65/245", "percentage": "26.53%", "elapsed_time": "3m 53s", "remaining_time": "10m 46s"} +{"loss": 0.54083738, "token_acc": 0.83800251, "grad_norm": 0.1589831, "learning_rate": 8.583e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.278722, "epoch": 1.4040404, "global_step/max_steps": "70/245", "percentage": "28.57%", "elapsed_time": "4m 10s", "remaining_time": "10m 27s"} +{"loss": 0.7484941, "token_acc": 0.7963293, "grad_norm": 0.15890305, "learning_rate": 8.339e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280163, "epoch": 1.50505051, "global_step/max_steps": "75/245", "percentage": "30.61%", "elapsed_time": "4m 27s", "remaining_time": "10m 6s"} +{"loss": 0.76522217, "token_acc": 0.77358568, "grad_norm": 0.28203714, "learning_rate": 8.079e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280971, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "4m 44s", "remaining_time": "9m 46s"} +{"eval_loss": 0.71281278, 
"eval_token_acc": 0.6988189, "eval_runtime": 0.7446, "eval_samples_per_second": 5.372, "eval_steps_per_second": 5.372, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "4m 45s", "remaining_time": "9m 48s"} +{"loss": 0.67728591, "token_acc": 0.80141327, "grad_norm": 0.17685989, "learning_rate": 7.806e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280315, "epoch": 1.70707071, "global_step/max_steps": "85/245", "percentage": "34.69%", "elapsed_time": "5m 2s", "remaining_time": "9m 30s"} +{"loss": 0.67323499, "token_acc": 0.80405668, "grad_norm": 0.20143381, "learning_rate": 7.52e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280753, "epoch": 1.80808081, "global_step/max_steps": "90/245", "percentage": "36.73%", "elapsed_time": "5m 20s", "remaining_time": "9m 11s"} +{"loss": 0.66789923, "token_acc": 0.80677238, "grad_norm": 0.14572597, "learning_rate": 7.222e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.28155, "epoch": 1.90909091, "global_step/max_steps": "95/245", "percentage": "38.78%", "elapsed_time": "5m 37s", "remaining_time": "8m 52s"} +{"loss": 0.65986643, "token_acc": 0.82518671, "grad_norm": 0.30259955, "learning_rate": 6.913e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.283616, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "5m 52s", "remaining_time": "8m 30s"} +{"eval_loss": 0.68955362, "eval_token_acc": 0.69488189, "eval_runtime": 0.5973, "eval_samples_per_second": 6.697, "eval_steps_per_second": 6.697, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "5m 52s", "remaining_time": "8m 31s"} +{"loss": 0.59760189, "token_acc": 0.81092891, "grad_norm": 0.22523801, "learning_rate": 6.597e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.283455, "epoch": 2.1010101, "global_step/max_steps": "105/245", "percentage": "42.86%", "elapsed_time": "6m 10s", "remaining_time": "8m 13s"} +{"loss": 0.65382123, "token_acc": 
0.80343342, "grad_norm": 0.17166492, "learning_rate": 6.272e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.2839, "epoch": 2.2020202, "global_step/max_steps": "110/245", "percentage": "44.90%", "elapsed_time": "6m 27s", "remaining_time": "7m 55s"} +{"loss": 0.64099641, "token_acc": 0.81307773, "grad_norm": 0.20610365, "learning_rate": 5.942e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.284576, "epoch": 2.3030303, "global_step/max_steps": "115/245", "percentage": "46.94%", "elapsed_time": "6m 43s", "remaining_time": "7m 36s"} +{"loss": 0.61160769, "token_acc": 0.83259662, "grad_norm": 0.28608537, "learning_rate": 5.608e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285065, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "7m 0s", "remaining_time": "7m 18s"} +{"eval_loss": 0.67330974, "eval_token_acc": 0.69980315, "eval_runtime": 0.6385, "eval_samples_per_second": 6.264, "eval_steps_per_second": 6.264, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "7m 1s", "remaining_time": "7m 18s"} +{"loss": 0.59851341, "token_acc": 0.80925627, "grad_norm": 0.24673469, "learning_rate": 5.271e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285334, "epoch": 2.50505051, "global_step/max_steps": "125/245", "percentage": "51.02%", "elapsed_time": "7m 17s", "remaining_time": "7m 0s"} +{"loss": 0.75761824, "token_acc": 0.7910773, "grad_norm": 0.21779406, "learning_rate": 4.932e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285892, "epoch": 2.60606061, "global_step/max_steps": "130/245", "percentage": "53.06%", "elapsed_time": "7m 34s", "remaining_time": "6m 42s"} +{"loss": 0.58165874, "token_acc": 0.836009, "grad_norm": 0.2333926, "learning_rate": 4.594e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.286263, "epoch": 2.70707071, "global_step/max_steps": "135/245", "percentage": "55.10%", "elapsed_time": "7m 51s", "remaining_time": "6m 24s"} +{"loss": 0.68019996, 
"token_acc": 0.79894927, "grad_norm": 0.21131906, "learning_rate": 4.258e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.28699, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "8m 7s", "remaining_time": "6m 5s"} +{"eval_loss": 0.67441565, "eval_token_acc": 0.69586614, "eval_runtime": 0.6365, "eval_samples_per_second": 6.284, "eval_steps_per_second": 6.284, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "8m 8s", "remaining_time": "6m 6s"} +{"loss": 0.72084312, "token_acc": 0.79315045, "grad_norm": 0.24139576, "learning_rate": 3.925e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.286939, "epoch": 2.90909091, "global_step/max_steps": "145/245", "percentage": "59.18%", "elapsed_time": "8m 25s", "remaining_time": "5m 48s"} +{"loss": 0.76359639, "token_acc": 0.7863333, "grad_norm": 0.37159482, "learning_rate": 3.597e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288363, "epoch": 3.0, "global_step/max_steps": "150/245", "percentage": "61.22%", "elapsed_time": "8m 39s", "remaining_time": "5m 29s"} +{"loss": 0.75255685, "token_acc": 0.79219715, "grad_norm": 0.32629594, "learning_rate": 3.276e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288494, "epoch": 3.1010101, "global_step/max_steps": "155/245", "percentage": "63.27%", "elapsed_time": "8m 57s", "remaining_time": "5m 11s"} +{"loss": 0.60846081, "token_acc": 0.82477898, "grad_norm": 0.18450701, "learning_rate": 2.962e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289163, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "9m 13s", "remaining_time": "4m 53s"} +{"eval_loss": 0.66233611, "eval_token_acc": 0.69586614, "eval_runtime": 0.5658, "eval_samples_per_second": 7.069, "eval_steps_per_second": 7.069, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "9m 13s", "remaining_time": "4m 54s"} +{"loss": 0.5621006, 
"token_acc": 0.8357334, "grad_norm": 0.30131409, "learning_rate": 2.658e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288738, "epoch": 3.3030303, "global_step/max_steps": "165/245", "percentage": "67.35%", "elapsed_time": "9m 31s", "remaining_time": "4m 36s"} +{"loss": 0.64802957, "token_acc": 0.80386634, "grad_norm": 0.27321088, "learning_rate": 2.364e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288892, "epoch": 3.4040404, "global_step/max_steps": "170/245", "percentage": "69.39%", "elapsed_time": "9m 48s", "remaining_time": "4m 19s"} +{"loss": 0.62619448, "token_acc": 0.82339926, "grad_norm": 0.30054706, "learning_rate": 2.083e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289319, "epoch": 3.50505051, "global_step/max_steps": "175/245", "percentage": "71.43%", "elapsed_time": "10m 4s", "remaining_time": "4m 1s"} +{"loss": 0.55538268, "token_acc": 0.82888278, "grad_norm": 0.3772144, "learning_rate": 1.815e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289656, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "10m 21s", "remaining_time": "3m 44s"} +{"eval_loss": 0.66482264, "eval_token_acc": 0.69389764, "eval_runtime": 0.6198, "eval_samples_per_second": 6.454, "eval_steps_per_second": 6.454, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "10m 21s", "remaining_time": "3m 44s"} +{"loss": 0.60495296, "token_acc": 0.81879634, "grad_norm": 0.27175546, "learning_rate": 1.562e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289334, "epoch": 3.70707071, "global_step/max_steps": "185/245", "percentage": "75.51%", "elapsed_time": "10m 39s", "remaining_time": "3m 27s"} +{"loss": 0.61944981, "token_acc": 0.81401253, "grad_norm": 0.18251824, "learning_rate": 1.324e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289757, "epoch": 3.80808081, "global_step/max_steps": "190/245", "percentage": "77.55%", "elapsed_time": "10m 55s", "remaining_time": "3m 9s"} 
+{"loss": 0.66609654, "token_acc": 0.80823516, "grad_norm": 0.33976471, "learning_rate": 1.103e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290079, "epoch": 3.90909091, "global_step/max_steps": "195/245", "percentage": "79.59%", "elapsed_time": "11m 12s", "remaining_time": "2m 52s"} +{"loss": 0.58609457, "token_acc": 0.82075299, "grad_norm": 0.34775367, "learning_rate": 9e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291039, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "11m 26s", "remaining_time": "2m 34s"} +{"eval_loss": 0.65836138, "eval_token_acc": 0.69094488, "eval_runtime": 0.5726, "eval_samples_per_second": 6.986, "eval_steps_per_second": 6.986, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "11m 27s", "remaining_time": "2m 34s"} +{"loss": 0.52508326, "token_acc": 0.8369385, "grad_norm": 0.448035, "learning_rate": 7.16e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290757, "epoch": 4.1010101, "global_step/max_steps": "205/245", "percentage": "83.67%", "elapsed_time": "11m 44s", "remaining_time": "2m 17s"} +{"loss": 0.76263571, "token_acc": 0.77960526, "grad_norm": 0.22857556, "learning_rate": 5.51e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290909, "epoch": 4.2020202, "global_step/max_steps": "210/245", "percentage": "85.71%", "elapsed_time": "12m 1s", "remaining_time": "2m 0s"} +{"loss": 0.64942198, "token_acc": 0.81636393, "grad_norm": 0.20537797, "learning_rate": 4.07e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291173, "epoch": 4.3030303, "global_step/max_steps": "215/245", "percentage": "87.76%", "elapsed_time": "12m 18s", "remaining_time": "1m 42s"} +{"loss": 0.64946952, "token_acc": 0.8097098, "grad_norm": 0.33448428, "learning_rate": 2.84e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291297, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "12m 35s", "remaining_time": "1m 25s"} 
+{"eval_loss": 0.65625179, "eval_token_acc": 0.69192913, "eval_runtime": 0.5649, "eval_samples_per_second": 7.081, "eval_steps_per_second": 7.081, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "12m 35s", "remaining_time": "1m 25s"} +{"loss": 0.57004008, "token_acc": 0.82630273, "grad_norm": 0.29320535, "learning_rate": 1.82e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291192, "epoch": 4.50505051, "global_step/max_steps": "225/245", "percentage": "91.84%", "elapsed_time": "12m 52s", "remaining_time": "1m 8s"} +{"loss": 0.56723037, "token_acc": 0.834748, "grad_norm": 0.29753655, "learning_rate": 1.03e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291184, "epoch": 4.60606061, "global_step/max_steps": "230/245", "percentage": "93.88%", "elapsed_time": "13m 9s", "remaining_time": "51s"} +{"loss": 0.54338398, "token_acc": 0.83846223, "grad_norm": 0.23516774, "learning_rate": 4.6e-07, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291437, "epoch": 4.70707071, "global_step/max_steps": "235/245", "percentage": "95.92%", "elapsed_time": "13m 26s", "remaining_time": "34s"} +{"loss": 0.56805825, "token_acc": 0.82996426, "grad_norm": 0.25804096, "learning_rate": 1.1e-07, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291576, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "13m 42s", "remaining_time": "17s"} +{"eval_loss": 0.65665424, "eval_token_acc": 0.68996063, "eval_runtime": 0.5647, "eval_samples_per_second": 7.084, "eval_steps_per_second": 7.084, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "13m 43s", "remaining_time": "17s"} +{"loss": 0.55348296, "token_acc": 0.82621661, "grad_norm": 0.32335183, "learning_rate": 0.0, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291506, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 0s", "remaining_time": "0s"} 
+{"eval_loss": 0.65595973, "eval_token_acc": 0.69488189, "eval_runtime": 0.6383, "eval_samples_per_second": 6.267, "eval_steps_per_second": 6.267, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 0s", "remaining_time": "0s"} +{"train_runtime": 841.285, "train_samples_per_second": 2.354, "train_steps_per_second": 0.291, "total_flos": 1.430410829455872e+16, "train_loss": 0.68653497, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 1s", "remaining_time": "0s"} +{"train_dataset": "784.851010±638.096273, min=60.000000, max=4149.000000, size=396", "val_dataset": "325.750000±308.768825, min=104.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1795.5528M Params (18.4648M Trainable [1.0284%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/checkpoint-245", "best_metric": 0.65595973, "global_step": 245, "log_history": [{"loss": 0.7323381900787354, "token_acc": 0.8218189768255356, "grad_norm": 0.3113684058189392, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 13.01, "train_speed(iter/s)": 0.062844, "epoch": 0.020202020202020204, "step": 1}, {"loss": 1.1823183298110962, "token_acc": 0.7330933259484907, "grad_norm": 0.971348226070404, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 22.91, "train_speed(iter/s)": 0.166808, "epoch": 0.10101010101010101, "step": 5}, {"loss": 0.9501662254333496, "token_acc": 0.7653597716689199, "grad_norm": 0.3070746958255768, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 29.36, "train_speed(iter/s)": 0.214042, "epoch": 0.20202020202020202, "step": 10}, {"loss": 0.8983875274658203, "token_acc": 
0.7695364238410596, "grad_norm": 0.45415690541267395, "learning_rate": 9.99816643111642e-05, "memory(GiB)": 35.88, "train_speed(iter/s)": 0.236498, "epoch": 0.30303030303030304, "step": 15}, {"loss": 0.7643344879150391, "token_acc": 0.7882758359525883, "grad_norm": 0.23825205862522125, "learning_rate": 9.977554222133292e-05, "memory(GiB)": 35.89, "train_speed(iter/s)": 0.25003, "epoch": 0.40404040404040403, "step": 20}, {"eval_loss": 0.9797316789627075, "eval_token_acc": 0.6692913385826772, "eval_runtime": 0.6501, "eval_samples_per_second": 6.153, "eval_steps_per_second": 6.153, "epoch": 0.40404040404040403, "step": 20}, {"loss": 0.7561020851135254, "token_acc": 0.7834674238838933, "grad_norm": 0.1587868332862854, "learning_rate": 9.934132612707632e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.254419, "epoch": 0.5050505050505051, "step": 25}, {"loss": 0.8542494773864746, "token_acc": 0.753417087278488, "grad_norm": 0.1786944419145584, "learning_rate": 9.868100580255466e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.260953, "epoch": 0.6060606060606061, "step": 30}, {"loss": 0.7791708469390869, "token_acc": 0.7836566725455615, "grad_norm": 0.19432976841926575, "learning_rate": 9.779760713358059e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.266293, "epoch": 0.7070707070707071, "step": 35}, {"loss": 0.6533683776855469, "token_acc": 0.8123940031205481, "grad_norm": 0.19285419583320618, "learning_rate": 9.669517825164434e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.26967, "epoch": 0.8080808080808081, "step": 40}, {"eval_loss": 0.7691243290901184, "eval_token_acc": 0.6860236220472441, "eval_runtime": 0.6812, "eval_samples_per_second": 5.872, "eval_steps_per_second": 5.872, "epoch": 0.8080808080808081, "step": 40}, {"loss": 0.6813922882080078, "token_acc": 0.793583974452025, "grad_norm": 0.16239717602729797, "learning_rate": 9.537877098354786e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.270446, "epoch": 0.9090909090909091, "step": 45}, 
{"loss": 0.8526185989379883, "token_acc": 0.7657785542694912, "grad_norm": 0.13025595247745514, "learning_rate": 9.385441770165385e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.275541, "epoch": 1.0, "step": 50}, {"loss": 0.8316545486450195, "token_acc": 0.7762060024132885, "grad_norm": 0.17670190334320068, "learning_rate": 9.212910368083245e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.277057, "epoch": 1.101010101010101, "step": 55}, {"loss": 0.7688841819763184, "token_acc": 0.7774478686734241, "grad_norm": 0.17425084114074707, "learning_rate": 9.021073508877845e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.278339, "epoch": 1.202020202020202, "step": 60}, {"eval_loss": 0.7357966899871826, "eval_token_acc": 0.6919291338582677, "eval_runtime": 0.7931, "eval_samples_per_second": 5.044, "eval_steps_per_second": 5.044, "epoch": 1.202020202020202, "step": 60}, {"loss": 0.8001401901245118, "token_acc": 0.7726751710112775, "grad_norm": 0.21490731835365295, "learning_rate": 8.810810275638183e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.277964, "epoch": 1.303030303030303, "step": 65}, {"loss": 0.5408373832702636, "token_acc": 0.8380025125628141, "grad_norm": 0.15898309648036957, "learning_rate": 8.583084189417224e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.278722, "epoch": 1.404040404040404, "step": 70}, {"loss": 0.7484941005706787, "token_acc": 0.796329302952936, "grad_norm": 0.15890304744243622, "learning_rate": 8.338938793943478e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280163, "epoch": 1.5050505050505052, "step": 75}, {"loss": 0.76522216796875, "token_acc": 0.773585676913015, "grad_norm": 0.2820371389389038, "learning_rate": 8.079492873632554e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280971, "epoch": 1.606060606060606, "step": 80}, {"eval_loss": 0.7128127813339233, "eval_token_acc": 0.6988188976377953, "eval_runtime": 0.7446, "eval_samples_per_second": 5.372, "eval_steps_per_second": 5.372, "epoch": 1.606060606060606, 
"step": 80}, {"loss": 0.67728590965271, "token_acc": 0.8014132731113863, "grad_norm": 0.17685988545417786, "learning_rate": 7.805935326811912e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280315, "epoch": 1.7070707070707072, "step": 85}, {"loss": 0.6732349872589112, "token_acc": 0.804056684575255, "grad_norm": 0.20143380761146545, "learning_rate": 7.519519717652039e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.280753, "epoch": 1.808080808080808, "step": 90}, {"loss": 0.667899227142334, "token_acc": 0.8067723818962669, "grad_norm": 0.14572596549987793, "learning_rate": 7.221558531769519e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.28155, "epoch": 1.9090909090909092, "step": 95}, {"loss": 0.6598664283752441, "token_acc": 0.8251867090022807, "grad_norm": 0.30259954929351807, "learning_rate": 6.91341716182545e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.283616, "epoch": 2.0, "step": 100}, {"eval_loss": 0.6895536184310913, "eval_token_acc": 0.6948818897637795, "eval_runtime": 0.5973, "eval_samples_per_second": 6.697, "eval_steps_per_second": 6.697, "epoch": 2.0, "step": 100}, {"loss": 0.5976018905639648, "token_acc": 0.8109289125146409, "grad_norm": 0.22523801028728485, "learning_rate": 6.5965076506799e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.283455, "epoch": 2.101010101010101, "step": 105}, {"loss": 0.6538212299346924, "token_acc": 0.803433422019511, "grad_norm": 0.17166492342948914, "learning_rate": 6.272282220774091e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.2839, "epoch": 2.202020202020202, "step": 110}, {"loss": 0.6409964084625244, "token_acc": 0.8130777268708304, "grad_norm": 0.20610365271568298, "learning_rate": 5.9422266193915924e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.284576, "epoch": 2.303030303030303, "step": 115}, {"loss": 0.6116076946258545, "token_acc": 0.8325966178677157, "grad_norm": 0.2860853672027588, "learning_rate": 5.6078533102935745e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285065, 
"epoch": 2.404040404040404, "step": 120}, {"eval_loss": 0.6733097434043884, "eval_token_acc": 0.6998031496062992, "eval_runtime": 0.6385, "eval_samples_per_second": 6.264, "eval_steps_per_second": 6.264, "epoch": 2.404040404040404, "step": 120}, {"loss": 0.598513412475586, "token_acc": 0.8092562679963481, "grad_norm": 0.24673469364643097, "learning_rate": 5.270694542927088e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285334, "epoch": 2.505050505050505, "step": 125}, {"loss": 0.7576182365417481, "token_acc": 0.7910772994877284, "grad_norm": 0.21779406070709229, "learning_rate": 4.9322953309663916e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.285892, "epoch": 2.606060606060606, "step": 130}, {"loss": 0.5816587448120117, "token_acc": 0.836008997627338, "grad_norm": 0.233392596244812, "learning_rate": 4.594206372362845e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.286263, "epoch": 2.707070707070707, "step": 135}, {"loss": 0.6801999568939209, "token_acc": 0.7989492694138893, "grad_norm": 0.21131905913352966, "learning_rate": 4.2579769433468694e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.28699, "epoch": 2.808080808080808, "step": 140}, {"eval_loss": 0.674415647983551, "eval_token_acc": 0.6958661417322834, "eval_runtime": 0.6365, "eval_samples_per_second": 6.284, "eval_steps_per_second": 6.284, "epoch": 2.808080808080808, "step": 140}, {"loss": 0.7208431243896485, "token_acc": 0.7931504515249617, "grad_norm": 0.2413957566022873, "learning_rate": 3.92514779894488e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.286939, "epoch": 2.909090909090909, "step": 145}, {"loss": 0.7635963916778564, "token_acc": 0.7863333044457925, "grad_norm": 0.3715948164463043, "learning_rate": 3.597244112544208e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288363, "epoch": 3.0, "step": 150}, {"loss": 0.752556848526001, "token_acc": 0.7921971532921459, "grad_norm": 0.3262959420681, "learning_rate": 3.275768486860149e-05, "memory(GiB)": 45.29, 
"train_speed(iter/s)": 0.288494, "epoch": 3.101010101010101, "step": 155}, {"loss": 0.608460807800293, "token_acc": 0.8247789822248602, "grad_norm": 0.18450701236724854, "learning_rate": 2.962194068331996e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289163, "epoch": 3.202020202020202, "step": 160}, {"eval_loss": 0.6623361110687256, "eval_token_acc": 0.6958661417322834, "eval_runtime": 0.5658, "eval_samples_per_second": 7.069, "eval_steps_per_second": 7.069, "epoch": 3.202020202020202, "step": 160}, {"loss": 0.562100601196289, "token_acc": 0.8357333974686588, "grad_norm": 0.3013140857219696, "learning_rate": 2.65795779650105e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288738, "epoch": 3.303030303030303, "step": 165}, {"loss": 0.6480295658111572, "token_acc": 0.8038663373977466, "grad_norm": 0.27321088314056396, "learning_rate": 2.3644538193049625e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.288892, "epoch": 3.404040404040404, "step": 170}, {"loss": 0.6261944770812988, "token_acc": 0.8233992624874288, "grad_norm": 0.3005470633506775, "learning_rate": 2.08302710446253e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289319, "epoch": 3.505050505050505, "step": 175}, {"loss": 0.5553826808929443, "token_acc": 0.8288827847075246, "grad_norm": 0.3772144019603729, "learning_rate": 1.8149672762244624e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289656, "epoch": 3.606060606060606, "step": 180}, {"eval_loss": 0.6648226380348206, "eval_token_acc": 0.6938976377952756, "eval_runtime": 0.6198, "eval_samples_per_second": 6.454, "eval_steps_per_second": 6.454, "epoch": 3.606060606060606, "step": 180}, {"loss": 0.6049529552459717, "token_acc": 0.8187963431702451, "grad_norm": 0.2717554569244385, "learning_rate": 1.561502705732883e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289334, "epoch": 3.707070707070707, "step": 185}, {"loss": 0.6194498062133789, "token_acc": 0.8140125301994185, "grad_norm": 0.18251824378967285, "learning_rate": 
1.3237948820702495e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.289757, "epoch": 3.808080808080808, "step": 190}, {"loss": 0.666096544265747, "token_acc": 0.8082351568879039, "grad_norm": 0.33976471424102783, "learning_rate": 1.102933089792042e-05, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290079, "epoch": 3.909090909090909, "step": 195}, {"loss": 0.5860945701599121, "token_acc": 0.8207529927807731, "grad_norm": 0.3477536737918854, "learning_rate": 8.999294173332058e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291039, "epoch": 4.0, "step": 200}, {"eval_loss": 0.6583613753318787, "eval_token_acc": 0.6909448818897638, "eval_runtime": 0.5726, "eval_samples_per_second": 6.986, "eval_steps_per_second": 6.986, "epoch": 4.0, "step": 200}, {"loss": 0.5250832557678222, "token_acc": 0.8369385000963948, "grad_norm": 0.44803500175476074, "learning_rate": 7.157141191620548e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290757, "epoch": 4.101010101010101, "step": 205}, {"loss": 0.7626357078552246, "token_acc": 0.7796052631578947, "grad_norm": 0.22857555747032166, "learning_rate": 5.5113135293435815e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.290909, "epoch": 4.202020202020202, "step": 210}, {"loss": 0.6494219779968262, "token_acc": 0.8163639347849874, "grad_norm": 0.2053779661655426, "learning_rate": 4.069353111818913e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291173, "epoch": 4.303030303030303, "step": 215}, {"loss": 0.649469518661499, "token_acc": 0.8097098023092473, "grad_norm": 0.3344842791557312, "learning_rate": 2.8378676526178482e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291297, "epoch": 4.404040404040404, "step": 220}, {"eval_loss": 0.6562517881393433, "eval_token_acc": 0.6919291338582677, "eval_runtime": 0.5649, "eval_samples_per_second": 7.081, "eval_steps_per_second": 7.081, "epoch": 4.404040404040404, "step": 220}, {"loss": 0.5700400829315185, "token_acc": 0.826302729528536, "grad_norm": 0.2932053506374359, 
"learning_rate": 1.8225003740388547e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291192, "epoch": 4.505050505050505, "step": 225}, {"loss": 0.5672303676605225, "token_acc": 0.8347479998375502, "grad_norm": 0.29753655195236206, "learning_rate": 1.0279041473154116e-06, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291184, "epoch": 4.606060606060606, "step": 230}, {"loss": 0.5433839797973633, "token_acc": 0.8384622326504828, "grad_norm": 0.2351677417755127, "learning_rate": 4.577201710596612e-07, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291437, "epoch": 4.707070707070707, "step": 235}, {"loss": 0.5680582523345947, "token_acc": 0.829964260185847, "grad_norm": 0.2580409646034241, "learning_rate": 1.1456128564660273e-07, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291576, "epoch": 4.808080808080808, "step": 240}, {"eval_loss": 0.6566542387008667, "eval_token_acc": 0.6899606299212598, "eval_runtime": 0.5647, "eval_samples_per_second": 7.084, "eval_steps_per_second": 7.084, "epoch": 4.808080808080808, "step": 240}, {"loss": 0.553482961654663, "token_acc": 0.8262166079933734, "grad_norm": 0.32335183024406433, "learning_rate": 0.0, "memory(GiB)": 45.29, "train_speed(iter/s)": 0.291506, "epoch": 4.909090909090909, "step": 245}, {"eval_loss": 0.6559597253799438, "eval_token_acc": 0.6948818897637795, "eval_runtime": 0.6383, "eval_samples_per_second": 6.267, "eval_steps_per_second": 6.267, "epoch": 4.909090909090909, "step": 245}, {"train_runtime": 841.285, "train_samples_per_second": 2.354, "train_steps_per_second": 0.291, "total_flos": 1.430410829455872e+16, "train_loss": 0.6865349672278579, "epoch": 4.909090909090909, "step": 245}], "memory": 45.291015625} diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs/events.out.tfevents.1737746575.kml-dtmachine-18088-prod.46041.0 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs/events.out.tfevents.1737746575.kml-dtmachine-18088-prod.46041.0 new file mode 100644 index 0000000000000000000000000000000000000000..48ac67fdea4b6d67cd9272961f3522e309d656e6 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-192220/runs/events.out.tfevents.1737746575.kml-dtmachine-18088-prod.46041.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace171445f51852472a86855f15495d57bb770512eed5806a988c0c9efbff16c +size 29549 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/args.json new file mode 100644 index 0000000000000000000000000000000000000000..a0eafede89bbb0ea69bffca2a0848ff566ad4e21 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + 
null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs", + "logging_strategy": "steps", + 
"logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + 
"include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + 
"galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, 
logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, 
hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/README.md b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information 
Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5865f2203e9f5deb586ec0045e256414b71aa9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..363aca81812da77a1fa9579d24a72a9494db8e39 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8cfa0005017734fc6ba927f903bcd47c6c5db93fa782814e90ab52d619a0973 +size 73911112 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/additional_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..a0eafede89bbb0ea69bffca2a0848ff566ad4e21 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/optimizer.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..73ddb4ee59de874cfde648c2d5bca629f951c6ef --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53dd37603a6da9f29a493847066480374f9b9bad81c78f282b02d58ebd39e65b +size 148047722 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/rng_state.pth b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..14b16dac6b065ac1071d44b95b02f246a1401bf9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:497ca48834b612ae7ed4bd48a0af2ffd6a0e2271285be9659b602f337b03c861 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/scheduler.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/scheduler.pt new file mode 100644 
index 0000000000000000000000000000000000000000..f5a7d8bc4abba06c30f7a7a85d5d4173984e4c9c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb96daf4c69d48b38d52e8ae266563af1b957cba209e2ff4307709a50fc6770 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/trainer_state.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..30cf745027976bf8ba65b9a5379cb94472f6e567 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_metric": 0.63181639, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240", + "epoch": 4.808080808080808, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.3297547698020935, + "learning_rate": 7.692307692307694e-06, + "loss": 0.7305145859718323, + "memory(GiB)": 12.91, + "step": 1, + "token_acc": 0.8209444687363359, + "train_speed(iter/s)": 0.060849 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.9813061356544495, + "learning_rate": 3.846153846153846e-05, + "loss": 1.1813791990280151, + "memory(GiB)": 22.77, + "step": 5, + "token_acc": 0.7339241207421767, + "train_speed(iter/s)": 0.161645 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.32120126485824585, + "learning_rate": 7.692307692307693e-05, + "loss": 0.9473533630371094, + "memory(GiB)": 29.22, + "step": 10, + 
"token_acc": 0.7654348805768364, + "train_speed(iter/s)": 0.205858 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.44191011786460876, + "learning_rate": 9.99816643111642e-05, + "loss": 0.893269157409668, + "memory(GiB)": 35.7, + "step": 15, + "token_acc": 0.77, + "train_speed(iter/s)": 0.226791 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.21858885884284973, + "learning_rate": 9.977554222133292e-05, + "loss": 0.7626124858856201, + "memory(GiB)": 35.71, + "step": 20, + "token_acc": 0.7875563297610482, + "train_speed(iter/s)": 0.239697 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.9943904280662537, + "eval_runtime": 0.6729, + "eval_samples_per_second": 5.945, + "eval_steps_per_second": 5.945, + "eval_token_acc": 0.6712598425196851, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.157553568482399, + "learning_rate": 9.934132612707632e-05, + "loss": 0.7563517093658447, + "memory(GiB)": 45.11, + "step": 25, + "token_acc": 0.7827733080927591, + "train_speed(iter/s)": 0.24361 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.1752931773662567, + "learning_rate": 9.868100580255466e-05, + "loss": 0.85587158203125, + "memory(GiB)": 45.11, + "step": 30, + "token_acc": 0.7533065615444129, + "train_speed(iter/s)": 0.250218 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.19386179745197296, + "learning_rate": 9.779760713358059e-05, + "loss": 0.7806258201599121, + "memory(GiB)": 45.11, + "step": 35, + "token_acc": 0.7831725282705675, + "train_speed(iter/s)": 0.25421 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.18464191257953644, + "learning_rate": 9.669517825164434e-05, + "loss": 0.6546980857849121, + "memory(GiB)": 45.11, + "step": 40, + "token_acc": 0.8115121090835086, + "train_speed(iter/s)": 0.258062 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.7608067989349365, + "eval_runtime": 0.7105, + "eval_samples_per_second": 5.629, + "eval_steps_per_second": 5.629, + "eval_token_acc": 
0.6811023622047244, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.16011430323123932, + "learning_rate": 9.537877098354786e-05, + "loss": 0.6817403316497803, + "memory(GiB)": 45.11, + "step": 45, + "token_acc": 0.7932936565539266, + "train_speed(iter/s)": 0.260012 + }, + { + "epoch": 1.0, + "grad_norm": 0.1281077116727829, + "learning_rate": 9.385441770165385e-05, + "loss": 0.8535506248474121, + "memory(GiB)": 45.11, + "step": 50, + "token_acc": 0.7650505932881998, + "train_speed(iter/s)": 0.265003 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.17014774680137634, + "learning_rate": 9.212910368083245e-05, + "loss": 0.8319255828857421, + "memory(GiB)": 45.11, + "step": 55, + "token_acc": 0.7755898436497137, + "train_speed(iter/s)": 0.266811 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.1714233011007309, + "learning_rate": 9.021073508877845e-05, + "loss": 0.7703649520874023, + "memory(GiB)": 45.11, + "step": 60, + "token_acc": 0.7764240128323265, + "train_speed(iter/s)": 0.267776 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.7232142090797424, + "eval_runtime": 0.6604, + "eval_samples_per_second": 6.057, + "eval_steps_per_second": 6.057, + "eval_token_acc": 0.6909448818897638, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.2151585817337036, + "learning_rate": 8.810810275638183e-05, + "loss": 0.8008210182189941, + "memory(GiB)": 45.11, + "step": 65, + "token_acc": 0.7724902939545203, + "train_speed(iter/s)": 0.268541 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15938813984394073, + "learning_rate": 8.583084189417224e-05, + "loss": 0.5416983127593994, + "memory(GiB)": 45.11, + "step": 70, + "token_acc": 0.8375, + "train_speed(iter/s)": 0.270264 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.15911762416362762, + "learning_rate": 8.338938793943478e-05, + "loss": 0.749873161315918, + "memory(GiB)": 45.11, + "step": 75, + "token_acc": 0.7955103810395491, + "train_speed(iter/s)": 
0.27108 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.28179508447647095, + "learning_rate": 8.079492873632554e-05, + "loss": 0.766708517074585, + "memory(GiB)": 45.11, + "step": 80, + "token_acc": 0.773585676913015, + "train_speed(iter/s)": 0.272369 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.6973368525505066, + "eval_runtime": 0.768, + "eval_samples_per_second": 5.208, + "eval_steps_per_second": 5.208, + "eval_token_acc": 0.6988188976377953, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.1738196760416031, + "learning_rate": 7.805935326811912e-05, + "loss": 0.6776129245758057, + "memory(GiB)": 45.11, + "step": 85, + "token_acc": 0.8014861222408393, + "train_speed(iter/s)": 0.272509 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.1992630660533905, + "learning_rate": 7.519519717652039e-05, + "loss": 0.6742076396942138, + "memory(GiB)": 45.11, + "step": 90, + "token_acc": 0.8044849334267694, + "train_speed(iter/s)": 0.273947 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.14516697824001312, + "learning_rate": 7.221558531769519e-05, + "loss": 0.6686168670654297, + "memory(GiB)": 45.11, + "step": 95, + "token_acc": 0.8059464816650148, + "train_speed(iter/s)": 0.274694 + }, + { + "epoch": 2.0, + "grad_norm": 0.30001300573349, + "learning_rate": 6.91341716182545e-05, + "loss": 0.6593320369720459, + "memory(GiB)": 45.11, + "step": 100, + "token_acc": 0.8248736639685166, + "train_speed(iter/s)": 0.276767 + }, + { + "epoch": 2.0, + "eval_loss": 0.669586181640625, + "eval_runtime": 0.7354, + "eval_samples_per_second": 5.439, + "eval_steps_per_second": 5.439, + "eval_token_acc": 0.6978346456692913, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.22456850111484528, + "learning_rate": 6.5965076506799e-05, + "loss": 0.5989685535430909, + "memory(GiB)": 45.11, + "step": 105, + "token_acc": 0.8109289125146409, + "train_speed(iter/s)": 0.276373 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 
0.16906557977199554, + "learning_rate": 6.272282220774091e-05, + "loss": 0.6554426670074462, + "memory(GiB)": 45.11, + "step": 110, + "token_acc": 0.8035601165589763, + "train_speed(iter/s)": 0.276792 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.20648974180221558, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.6424230575561524, + "memory(GiB)": 45.11, + "step": 115, + "token_acc": 0.8126356402218471, + "train_speed(iter/s)": 0.277166 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.2807007133960724, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.611571216583252, + "memory(GiB)": 45.11, + "step": 120, + "token_acc": 0.831907214404477, + "train_speed(iter/s)": 0.277492 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.6545714735984802, + "eval_runtime": 0.6819, + "eval_samples_per_second": 5.866, + "eval_steps_per_second": 5.866, + "eval_token_acc": 0.7007874015748031, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.24286441504955292, + "learning_rate": 5.270694542927088e-05, + "loss": 0.5994139671325683, + "memory(GiB)": 45.11, + "step": 125, + "token_acc": 0.8090104642179928, + "train_speed(iter/s)": 0.277135 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.21559028327465057, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.7578139781951905, + "memory(GiB)": 45.11, + "step": 130, + "token_acc": 0.7908861533756404, + "train_speed(iter/s)": 0.277505 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.23157109320163727, + "learning_rate": 4.594206372362845e-05, + "loss": 0.5823601722717285, + "memory(GiB)": 45.11, + "step": 135, + "token_acc": 0.8350229562752288, + "train_speed(iter/s)": 0.277732 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.20862756669521332, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.680837869644165, + "memory(GiB)": 45.11, + "step": 140, + "token_acc": 0.7985224101132818, + "train_speed(iter/s)": 0.277895 + }, + { + "epoch": 2.808080808080808, + 
"eval_loss": 0.6512690782546997, + "eval_runtime": 0.6556, + "eval_samples_per_second": 6.101, + "eval_steps_per_second": 6.101, + "eval_token_acc": 0.7007874015748031, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.24315312504768372, + "learning_rate": 3.92514779894488e-05, + "loss": 0.7217820644378662, + "memory(GiB)": 45.11, + "step": 145, + "token_acc": 0.7932356449139547, + "train_speed(iter/s)": 0.277709 + }, + { + "epoch": 3.0, + "grad_norm": 0.36581990122795105, + "learning_rate": 3.597244112544208e-05, + "loss": 0.7646692276000977, + "memory(GiB)": 45.11, + "step": 150, + "token_acc": 0.7856833347777104, + "train_speed(iter/s)": 0.279058 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.3223930895328522, + "learning_rate": 3.275768486860149e-05, + "loss": 0.7542715549468995, + "memory(GiB)": 45.11, + "step": 155, + "token_acc": 0.7917875550397652, + "train_speed(iter/s)": 0.279417 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.18214310705661774, + "learning_rate": 2.962194068331996e-05, + "loss": 0.6093906402587891, + "memory(GiB)": 45.11, + "step": 160, + "token_acc": 0.8244665896098217, + "train_speed(iter/s)": 0.279613 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.6415175199508667, + "eval_runtime": 0.6171, + "eval_samples_per_second": 6.482, + "eval_steps_per_second": 6.482, + "eval_token_acc": 0.6998031496062992, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.2996014356613159, + "learning_rate": 2.65795779650105e-05, + "loss": 0.5636696338653564, + "memory(GiB)": 45.11, + "step": 165, + "token_acc": 0.8361542855424947, + "train_speed(iter/s)": 0.279575 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.26891595125198364, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.6497210025787353, + "memory(GiB)": 45.11, + "step": 170, + "token_acc": 0.8024000617379226, + "train_speed(iter/s)": 0.279738 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.29831352829933167, + 
"learning_rate": 2.08302710446253e-05, + "loss": 0.6274948596954346, + "memory(GiB)": 45.11, + "step": 175, + "token_acc": 0.8227623198122695, + "train_speed(iter/s)": 0.280045 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.3759668469429016, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.557886791229248, + "memory(GiB)": 45.11, + "step": 180, + "token_acc": 0.8270365200162111, + "train_speed(iter/s)": 0.280492 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.6439455151557922, + "eval_runtime": 0.6302, + "eval_samples_per_second": 6.347, + "eval_steps_per_second": 6.347, + "eval_token_acc": 0.7007874015748031, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.2696785032749176, + "learning_rate": 1.561502705732883e-05, + "loss": 0.606973934173584, + "memory(GiB)": 45.11, + "step": 185, + "token_acc": 0.8181412108037283, + "train_speed(iter/s)": 0.280333 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.1801028549671173, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.6212857723236084, + "memory(GiB)": 45.11, + "step": 190, + "token_acc": 0.8139715818353057, + "train_speed(iter/s)": 0.280597 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.33335137367248535, + "learning_rate": 1.102933089792042e-05, + "loss": 0.6672876834869385, + "memory(GiB)": 45.11, + "step": 195, + "token_acc": 0.8081573933667716, + "train_speed(iter/s)": 0.280632 + }, + { + "epoch": 4.0, + "grad_norm": 0.34127023816108704, + "learning_rate": 8.999294173332058e-06, + "loss": 0.5871144294738769, + "memory(GiB)": 45.11, + "step": 200, + "token_acc": 0.8203874623046696, + "train_speed(iter/s)": 0.281583 + }, + { + "epoch": 4.0, + "eval_loss": 0.6337329745292664, + "eval_runtime": 0.62, + "eval_samples_per_second": 6.452, + "eval_steps_per_second": 6.452, + "eval_token_acc": 0.6988188976377953, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.43939781188964844, + "learning_rate": 7.157141191620548e-06, + "loss": 
0.5270902633666992, + "memory(GiB)": 45.11, + "step": 205, + "token_acc": 0.8366300366300367, + "train_speed(iter/s)": 0.28149 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.2254554033279419, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.7644347667694091, + "memory(GiB)": 45.11, + "step": 210, + "token_acc": 0.7781070870244919, + "train_speed(iter/s)": 0.281717 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.20229189097881317, + "learning_rate": 4.069353111818913e-06, + "loss": 0.6509243011474609, + "memory(GiB)": 45.11, + "step": 215, + "token_acc": 0.815105591421381, + "train_speed(iter/s)": 0.281915 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.33297428488731384, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.651677417755127, + "memory(GiB)": 45.11, + "step": 220, + "token_acc": 0.808887518415733, + "train_speed(iter/s)": 0.282131 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.6320821046829224, + "eval_runtime": 0.5918, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 6.759, + "eval_token_acc": 0.6978346456692913, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.29022741317749023, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.572452449798584, + "memory(GiB)": 45.11, + "step": 225, + "token_acc": 0.8266844817713304, + "train_speed(iter/s)": 0.28185 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.2930595278739929, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.5687718391418457, + "memory(GiB)": 45.11, + "step": 230, + "token_acc": 0.8347073874020225, + "train_speed(iter/s)": 0.281839 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.23262919485569, + "learning_rate": 4.577201710596612e-07, + "loss": 0.5446483612060546, + "memory(GiB)": 45.11, + "step": 235, + "token_acc": 0.8374695424600668, + "train_speed(iter/s)": 0.282162 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.25554314255714417, + "learning_rate": 1.1456128564660273e-07, + 
"loss": 0.5697183609008789, + "memory(GiB)": 45.11, + "step": 240, + "token_acc": 0.8293924231593995, + "train_speed(iter/s)": 0.282314 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.6318163871765137, + "eval_runtime": 0.7091, + "eval_samples_per_second": 5.641, + "eval_steps_per_second": 5.641, + "eval_token_acc": 0.6978346456692913, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3888534293467136e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/training_args.bin b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08f486ce4f22a45b4bd9d2c667024071bb3a2bb4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde752d8cf6dea519be06cca689108d28e1bedd09e89f70bdb83f0b05163a7e6 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/README.md b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd09c386589f56a7693d9a0c36a3c04ec1f0b009 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/README.md @@ -0,0 
+1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_config.json 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5865f2203e9f5deb586ec0045e256414b71aa9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..631d716df611e832c537c2ce232d7ce7430e8daa --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:665e71dab4936ce472113c1e593d403f6a446b56f790d4087ee1f6f3dd48e3d0 +size 73911112 diff --git 
a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/additional_config.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/args.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/args.json new file mode 100644 index 0000000000000000000000000000000000000000..a0eafede89bbb0ea69bffca2a0848ff566ad4e21 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + 
"download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": 
null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + 
"vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, 
warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/optimizer.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5554b76757846ad655a0d0e555c0be2d32a5cfa --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:85e9d83591991859a319cb7442acf4d52c69ae642e44db32d28511242895e958 +size 148047722 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/rng_state.pth b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9970f4cd29a679d6a260485863e04fc3b3ec444 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20bc9ee8e5750029d14c52d4e56689422886c81ebccf9be6c969a11f387718af +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/scheduler.pt b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c07b610e877e513fda3813a64af716a38654c2f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ba022925b97a0c60fdb73ede217e52b3b55c5065f112ff19fea77b6a69dd5d +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/trainer_state.json b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c70be455c6e2f78430e7f82871b8e7a156b2a8d1 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/trainer_state.json @@ -0,0 +1,650 @@ +{ + "best_metric": 0.63124788, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245", + "epoch": 4.909090909090909, + "eval_steps": 20, + "global_step": 245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.3297547698020935, + "learning_rate": 7.692307692307694e-06, + "loss": 0.7305145859718323, + "memory(GiB)": 12.91, + "step": 1, + "token_acc": 0.8209444687363359, + "train_speed(iter/s)": 0.060849 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.9813061356544495, + "learning_rate": 3.846153846153846e-05, + "loss": 1.1813791990280151, + "memory(GiB)": 22.77, + "step": 5, + "token_acc": 0.7339241207421767, + "train_speed(iter/s)": 0.161645 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.32120126485824585, + "learning_rate": 7.692307692307693e-05, + "loss": 0.9473533630371094, + "memory(GiB)": 29.22, + "step": 10, + "token_acc": 0.7654348805768364, + "train_speed(iter/s)": 0.205858 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.44191011786460876, + "learning_rate": 9.99816643111642e-05, + "loss": 0.893269157409668, + "memory(GiB)": 35.7, + "step": 15, + "token_acc": 0.77, + "train_speed(iter/s)": 0.226791 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.21858885884284973, + "learning_rate": 9.977554222133292e-05, + "loss": 0.7626124858856201, + "memory(GiB)": 35.71, + "step": 20, + "token_acc": 0.7875563297610482, + "train_speed(iter/s)": 0.239697 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.9943904280662537, + "eval_runtime": 0.6729, + "eval_samples_per_second": 5.945, + "eval_steps_per_second": 5.945, + "eval_token_acc": 0.6712598425196851, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.157553568482399, + "learning_rate": 9.934132612707632e-05, + "loss": 0.7563517093658447, + "memory(GiB)": 45.11, + "step": 25, + 
"token_acc": 0.7827733080927591, + "train_speed(iter/s)": 0.24361 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.1752931773662567, + "learning_rate": 9.868100580255466e-05, + "loss": 0.85587158203125, + "memory(GiB)": 45.11, + "step": 30, + "token_acc": 0.7533065615444129, + "train_speed(iter/s)": 0.250218 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.19386179745197296, + "learning_rate": 9.779760713358059e-05, + "loss": 0.7806258201599121, + "memory(GiB)": 45.11, + "step": 35, + "token_acc": 0.7831725282705675, + "train_speed(iter/s)": 0.25421 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.18464191257953644, + "learning_rate": 9.669517825164434e-05, + "loss": 0.6546980857849121, + "memory(GiB)": 45.11, + "step": 40, + "token_acc": 0.8115121090835086, + "train_speed(iter/s)": 0.258062 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.7608067989349365, + "eval_runtime": 0.7105, + "eval_samples_per_second": 5.629, + "eval_steps_per_second": 5.629, + "eval_token_acc": 0.6811023622047244, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.16011430323123932, + "learning_rate": 9.537877098354786e-05, + "loss": 0.6817403316497803, + "memory(GiB)": 45.11, + "step": 45, + "token_acc": 0.7932936565539266, + "train_speed(iter/s)": 0.260012 + }, + { + "epoch": 1.0, + "grad_norm": 0.1281077116727829, + "learning_rate": 9.385441770165385e-05, + "loss": 0.8535506248474121, + "memory(GiB)": 45.11, + "step": 50, + "token_acc": 0.7650505932881998, + "train_speed(iter/s)": 0.265003 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.17014774680137634, + "learning_rate": 9.212910368083245e-05, + "loss": 0.8319255828857421, + "memory(GiB)": 45.11, + "step": 55, + "token_acc": 0.7755898436497137, + "train_speed(iter/s)": 0.266811 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.1714233011007309, + "learning_rate": 9.021073508877845e-05, + "loss": 0.7703649520874023, + "memory(GiB)": 45.11, + "step": 60, + "token_acc": 
0.7764240128323265, + "train_speed(iter/s)": 0.267776 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.7232142090797424, + "eval_runtime": 0.6604, + "eval_samples_per_second": 6.057, + "eval_steps_per_second": 6.057, + "eval_token_acc": 0.6909448818897638, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.2151585817337036, + "learning_rate": 8.810810275638183e-05, + "loss": 0.8008210182189941, + "memory(GiB)": 45.11, + "step": 65, + "token_acc": 0.7724902939545203, + "train_speed(iter/s)": 0.268541 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15938813984394073, + "learning_rate": 8.583084189417224e-05, + "loss": 0.5416983127593994, + "memory(GiB)": 45.11, + "step": 70, + "token_acc": 0.8375, + "train_speed(iter/s)": 0.270264 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.15911762416362762, + "learning_rate": 8.338938793943478e-05, + "loss": 0.749873161315918, + "memory(GiB)": 45.11, + "step": 75, + "token_acc": 0.7955103810395491, + "train_speed(iter/s)": 0.27108 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.28179508447647095, + "learning_rate": 8.079492873632554e-05, + "loss": 0.766708517074585, + "memory(GiB)": 45.11, + "step": 80, + "token_acc": 0.773585676913015, + "train_speed(iter/s)": 0.272369 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.6973368525505066, + "eval_runtime": 0.768, + "eval_samples_per_second": 5.208, + "eval_steps_per_second": 5.208, + "eval_token_acc": 0.6988188976377953, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.1738196760416031, + "learning_rate": 7.805935326811912e-05, + "loss": 0.6776129245758057, + "memory(GiB)": 45.11, + "step": 85, + "token_acc": 0.8014861222408393, + "train_speed(iter/s)": 0.272509 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.1992630660533905, + "learning_rate": 7.519519717652039e-05, + "loss": 0.6742076396942138, + "memory(GiB)": 45.11, + "step": 90, + "token_acc": 0.8044849334267694, + "train_speed(iter/s)": 
0.273947 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.14516697824001312, + "learning_rate": 7.221558531769519e-05, + "loss": 0.6686168670654297, + "memory(GiB)": 45.11, + "step": 95, + "token_acc": 0.8059464816650148, + "train_speed(iter/s)": 0.274694 + }, + { + "epoch": 2.0, + "grad_norm": 0.30001300573349, + "learning_rate": 6.91341716182545e-05, + "loss": 0.6593320369720459, + "memory(GiB)": 45.11, + "step": 100, + "token_acc": 0.8248736639685166, + "train_speed(iter/s)": 0.276767 + }, + { + "epoch": 2.0, + "eval_loss": 0.669586181640625, + "eval_runtime": 0.7354, + "eval_samples_per_second": 5.439, + "eval_steps_per_second": 5.439, + "eval_token_acc": 0.6978346456692913, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.22456850111484528, + "learning_rate": 6.5965076506799e-05, + "loss": 0.5989685535430909, + "memory(GiB)": 45.11, + "step": 105, + "token_acc": 0.8109289125146409, + "train_speed(iter/s)": 0.276373 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.16906557977199554, + "learning_rate": 6.272282220774091e-05, + "loss": 0.6554426670074462, + "memory(GiB)": 45.11, + "step": 110, + "token_acc": 0.8035601165589763, + "train_speed(iter/s)": 0.276792 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.20648974180221558, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.6424230575561524, + "memory(GiB)": 45.11, + "step": 115, + "token_acc": 0.8126356402218471, + "train_speed(iter/s)": 0.277166 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.2807007133960724, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.611571216583252, + "memory(GiB)": 45.11, + "step": 120, + "token_acc": 0.831907214404477, + "train_speed(iter/s)": 0.277492 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.6545714735984802, + "eval_runtime": 0.6819, + "eval_samples_per_second": 5.866, + "eval_steps_per_second": 5.866, + "eval_token_acc": 0.7007874015748031, + "step": 120 + }, + { + "epoch": 2.505050505050505, + 
"grad_norm": 0.24286441504955292, + "learning_rate": 5.270694542927088e-05, + "loss": 0.5994139671325683, + "memory(GiB)": 45.11, + "step": 125, + "token_acc": 0.8090104642179928, + "train_speed(iter/s)": 0.277135 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.21559028327465057, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.7578139781951905, + "memory(GiB)": 45.11, + "step": 130, + "token_acc": 0.7908861533756404, + "train_speed(iter/s)": 0.277505 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.23157109320163727, + "learning_rate": 4.594206372362845e-05, + "loss": 0.5823601722717285, + "memory(GiB)": 45.11, + "step": 135, + "token_acc": 0.8350229562752288, + "train_speed(iter/s)": 0.277732 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.20862756669521332, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.680837869644165, + "memory(GiB)": 45.11, + "step": 140, + "token_acc": 0.7985224101132818, + "train_speed(iter/s)": 0.277895 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.6512690782546997, + "eval_runtime": 0.6556, + "eval_samples_per_second": 6.101, + "eval_steps_per_second": 6.101, + "eval_token_acc": 0.7007874015748031, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.24315312504768372, + "learning_rate": 3.92514779894488e-05, + "loss": 0.7217820644378662, + "memory(GiB)": 45.11, + "step": 145, + "token_acc": 0.7932356449139547, + "train_speed(iter/s)": 0.277709 + }, + { + "epoch": 3.0, + "grad_norm": 0.36581990122795105, + "learning_rate": 3.597244112544208e-05, + "loss": 0.7646692276000977, + "memory(GiB)": 45.11, + "step": 150, + "token_acc": 0.7856833347777104, + "train_speed(iter/s)": 0.279058 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.3223930895328522, + "learning_rate": 3.275768486860149e-05, + "loss": 0.7542715549468995, + "memory(GiB)": 45.11, + "step": 155, + "token_acc": 0.7917875550397652, + "train_speed(iter/s)": 0.279417 + }, + { + "epoch": 3.202020202020202, + 
"grad_norm": 0.18214310705661774, + "learning_rate": 2.962194068331996e-05, + "loss": 0.6093906402587891, + "memory(GiB)": 45.11, + "step": 160, + "token_acc": 0.8244665896098217, + "train_speed(iter/s)": 0.279613 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.6415175199508667, + "eval_runtime": 0.6171, + "eval_samples_per_second": 6.482, + "eval_steps_per_second": 6.482, + "eval_token_acc": 0.6998031496062992, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.2996014356613159, + "learning_rate": 2.65795779650105e-05, + "loss": 0.5636696338653564, + "memory(GiB)": 45.11, + "step": 165, + "token_acc": 0.8361542855424947, + "train_speed(iter/s)": 0.279575 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.26891595125198364, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.6497210025787353, + "memory(GiB)": 45.11, + "step": 170, + "token_acc": 0.8024000617379226, + "train_speed(iter/s)": 0.279738 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.29831352829933167, + "learning_rate": 2.08302710446253e-05, + "loss": 0.6274948596954346, + "memory(GiB)": 45.11, + "step": 175, + "token_acc": 0.8227623198122695, + "train_speed(iter/s)": 0.280045 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.3759668469429016, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.557886791229248, + "memory(GiB)": 45.11, + "step": 180, + "token_acc": 0.8270365200162111, + "train_speed(iter/s)": 0.280492 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.6439455151557922, + "eval_runtime": 0.6302, + "eval_samples_per_second": 6.347, + "eval_steps_per_second": 6.347, + "eval_token_acc": 0.7007874015748031, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.2696785032749176, + "learning_rate": 1.561502705732883e-05, + "loss": 0.606973934173584, + "memory(GiB)": 45.11, + "step": 185, + "token_acc": 0.8181412108037283, + "train_speed(iter/s)": 0.280333 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 
0.1801028549671173, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.6212857723236084, + "memory(GiB)": 45.11, + "step": 190, + "token_acc": 0.8139715818353057, + "train_speed(iter/s)": 0.280597 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.33335137367248535, + "learning_rate": 1.102933089792042e-05, + "loss": 0.6672876834869385, + "memory(GiB)": 45.11, + "step": 195, + "token_acc": 0.8081573933667716, + "train_speed(iter/s)": 0.280632 + }, + { + "epoch": 4.0, + "grad_norm": 0.34127023816108704, + "learning_rate": 8.999294173332058e-06, + "loss": 0.5871144294738769, + "memory(GiB)": 45.11, + "step": 200, + "token_acc": 0.8203874623046696, + "train_speed(iter/s)": 0.281583 + }, + { + "epoch": 4.0, + "eval_loss": 0.6337329745292664, + "eval_runtime": 0.62, + "eval_samples_per_second": 6.452, + "eval_steps_per_second": 6.452, + "eval_token_acc": 0.6988188976377953, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.43939781188964844, + "learning_rate": 7.157141191620548e-06, + "loss": 0.5270902633666992, + "memory(GiB)": 45.11, + "step": 205, + "token_acc": 0.8366300366300367, + "train_speed(iter/s)": 0.28149 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.2254554033279419, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.7644347667694091, + "memory(GiB)": 45.11, + "step": 210, + "token_acc": 0.7781070870244919, + "train_speed(iter/s)": 0.281717 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.20229189097881317, + "learning_rate": 4.069353111818913e-06, + "loss": 0.6509243011474609, + "memory(GiB)": 45.11, + "step": 215, + "token_acc": 0.815105591421381, + "train_speed(iter/s)": 0.281915 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.33297428488731384, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.651677417755127, + "memory(GiB)": 45.11, + "step": 220, + "token_acc": 0.808887518415733, + "train_speed(iter/s)": 0.282131 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.6320821046829224, + 
"eval_runtime": 0.5918, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 6.759, + "eval_token_acc": 0.6978346456692913, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.29022741317749023, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.572452449798584, + "memory(GiB)": 45.11, + "step": 225, + "token_acc": 0.8266844817713304, + "train_speed(iter/s)": 0.28185 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.2930595278739929, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.5687718391418457, + "memory(GiB)": 45.11, + "step": 230, + "token_acc": 0.8347073874020225, + "train_speed(iter/s)": 0.281839 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.23262919485569, + "learning_rate": 4.577201710596612e-07, + "loss": 0.5446483612060546, + "memory(GiB)": 45.11, + "step": 235, + "token_acc": 0.8374695424600668, + "train_speed(iter/s)": 0.282162 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.25554314255714417, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.5697183609008789, + "memory(GiB)": 45.11, + "step": 240, + "token_acc": 0.8293924231593995, + "train_speed(iter/s)": 0.282314 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.6318163871765137, + "eval_runtime": 0.7091, + "eval_samples_per_second": 5.641, + "eval_steps_per_second": 5.641, + "eval_token_acc": 0.6978346456692913, + "step": 240 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.32179728150367737, + "learning_rate": 0.0, + "loss": 0.5548447608947754, + "memory(GiB)": 45.11, + "step": 245, + "token_acc": 0.8255539449161317, + "train_speed(iter/s)": 0.28211 + }, + { + "epoch": 4.909090909090909, + "eval_loss": 0.631247878074646, + "eval_runtime": 0.6316, + "eval_samples_per_second": 6.333, + "eval_steps_per_second": 6.333, + "eval_token_acc": 0.6968503937007874, + "step": 245 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { 
+ "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4131540621191168e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/training_args.bin b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08f486ce4f22a45b4bd9d2c667024071bb3a2bb4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde752d8cf6dea519be06cca689108d28e1bedd09e89f70bdb83f0b05163a7e6 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..54a85adf728720e3fc3298e205b7afe3b4e3b0db Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..ccc1bc515b39341511c52de61f46eaa7d29bb3a1 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_runtime.png differ diff 
--git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..b3fad33638c1488d16ba68d30a2189eae931a25a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..0eaef1dc17181fdf5e087b2e4e0b71537fe2e1eb Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..f90d902394b2ec902b136a0fb0dc3b1ca222dc69 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..9cc2f5018175ad195aab7254264c4c8fdcaa870b Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..f892264ae6fc999acc19bf71ff79b06d5394852b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..1a747bb27a6bf92800a1028c0eaaaa4305471d7f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..954d6c8372d597337cb8d7d579b4a95ecdef5362 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..bd1cf799c9e0a5c47e325aac4c560e86f98856d3 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..6dbd53db4208b835e4ef1c5363c9eef51b95c4a8 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..fe913b169190b15e0b27edd8f4d7e76a0b6b2052 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6bc392657ea2ef297a803100e6283ce4baece1db Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_runtime.png 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..b21d11544431c2e20528cd386e67419b6229589b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3e6e47b1f94d8cdb6433292f9e110aeed9a01224 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..df4afe08b7372bead7a0fea014556e5721ba26f1 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..43a715c653ca4694a4fd623109b0b999a178930e Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/logging.jsonl b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47a3491bc8a90087efc17c0e93ad023820fffd5a --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/logging.jsonl @@ -0,0 +1,65 @@ +{"loss": 0.73051459, "token_acc": 0.82094447, "grad_norm": 0.32975477, "learning_rate": 7.69e-06, "memory(GiB)": 12.91, "train_speed(iter/s)": 0.060849, "epoch": 0.02020202, "global_step/max_steps": "1/245", "percentage": "0.41%", "elapsed_time": "16s", "remaining_time": "1h 5m 32s"} +{"loss": 1.1813792, "token_acc": 0.73392412, "grad_norm": 0.98130614, "learning_rate": 3.846e-05, "memory(GiB)": 22.77, "train_speed(iter/s)": 0.161645, "epoch": 0.1010101, "global_step/max_steps": "5/245", "percentage": "2.04%", "elapsed_time": "30s", "remaining_time": "24m 29s"} +{"loss": 0.94735336, "token_acc": 0.76543488, "grad_norm": 0.32120126, "learning_rate": 7.692e-05, "memory(GiB)": 29.22, "train_speed(iter/s)": 0.205858, "epoch": 0.2020202, "global_step/max_steps": "10/245", "percentage": "4.08%", "elapsed_time": "48s", "remaining_time": "18m 54s"} +{"loss": 0.89326916, "token_acc": 0.77, "grad_norm": 0.44191012, "learning_rate": 9.998e-05, "memory(GiB)": 35.7, "train_speed(iter/s)": 0.226791, "epoch": 0.3030303, "global_step/max_steps": "15/245", "percentage": "6.12%", "elapsed_time": "1m 5s", "remaining_time": "16m 49s"} +{"loss": 0.76261249, "token_acc": 0.78755633, "grad_norm": 0.21858886, "learning_rate": 9.978e-05, "memory(GiB)": 35.71, "train_speed(iter/s)": 0.239697, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", 
"elapsed_time": "1m 23s", "remaining_time": "15m 35s"} +{"eval_loss": 0.99439043, "eval_token_acc": 0.67125984, "eval_runtime": 0.6729, "eval_samples_per_second": 5.945, "eval_steps_per_second": 5.945, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "1m 23s", "remaining_time": "15m 42s"} +{"loss": 0.75635171, "token_acc": 0.78277331, "grad_norm": 0.15755357, "learning_rate": 9.934e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.24361, "epoch": 0.50505051, "global_step/max_steps": "25/245", "percentage": "10.20%", "elapsed_time": "1m 42s", "remaining_time": "15m 0s"} +{"loss": 0.85587158, "token_acc": 0.75330656, "grad_norm": 0.17529318, "learning_rate": 9.868e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.250218, "epoch": 0.60606061, "global_step/max_steps": "30/245", "percentage": "12.24%", "elapsed_time": "1m 59s", "remaining_time": "14m 16s"} +{"loss": 0.78062582, "token_acc": 0.78317253, "grad_norm": 0.1938618, "learning_rate": 9.78e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.25421, "epoch": 0.70707071, "global_step/max_steps": "35/245", "percentage": "14.29%", "elapsed_time": "2m 17s", "remaining_time": "13m 44s"} +{"loss": 0.65469809, "token_acc": 0.81151211, "grad_norm": 0.18464191, "learning_rate": 9.67e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.258062, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "2m 34s", "remaining_time": "13m 12s"} +{"eval_loss": 0.7608068, "eval_token_acc": 0.68110236, "eval_runtime": 0.7105, "eval_samples_per_second": 5.629, "eval_steps_per_second": 5.629, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "2m 35s", "remaining_time": "13m 16s"} +{"loss": 0.68174033, "token_acc": 0.79329366, "grad_norm": 0.1601143, "learning_rate": 9.538e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.260012, "epoch": 0.90909091, "global_step/max_steps": "45/245", "percentage": 
"18.37%", "elapsed_time": "2m 52s", "remaining_time": "12m 47s"} +{"loss": 0.85355062, "token_acc": 0.76505059, "grad_norm": 0.12810771, "learning_rate": 9.385e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.265003, "epoch": 1.0, "global_step/max_steps": "50/245", "percentage": "20.41%", "elapsed_time": "3m 8s", "remaining_time": "12m 14s"} +{"loss": 0.83192558, "token_acc": 0.77558984, "grad_norm": 0.17014775, "learning_rate": 9.213e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.266811, "epoch": 1.1010101, "global_step/max_steps": "55/245", "percentage": "22.45%", "elapsed_time": "3m 25s", "remaining_time": "11m 51s"} +{"loss": 0.77036495, "token_acc": 0.77642401, "grad_norm": 0.1714233, "learning_rate": 9.021e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.267776, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "3m 43s", "remaining_time": "11m 29s"} +{"eval_loss": 0.72321421, "eval_token_acc": 0.69094488, "eval_runtime": 0.6604, "eval_samples_per_second": 6.057, "eval_steps_per_second": 6.057, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "3m 44s", "remaining_time": "11m 31s"} +{"loss": 0.80082102, "token_acc": 0.77249029, "grad_norm": 0.21515858, "learning_rate": 8.811e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.268541, "epoch": 1.3030303, "global_step/max_steps": "65/245", "percentage": "26.53%", "elapsed_time": "4m 1s", "remaining_time": "11m 9s"} +{"loss": 0.54169831, "token_acc": 0.8375, "grad_norm": 0.15938814, "learning_rate": 8.583e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.270264, "epoch": 1.4040404, "global_step/max_steps": "70/245", "percentage": "28.57%", "elapsed_time": "4m 18s", "remaining_time": "10m 46s"} +{"loss": 0.74987316, "token_acc": 0.79551038, "grad_norm": 0.15911762, "learning_rate": 8.339e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.27108, "epoch": 1.50505051, "global_step/max_steps": "75/245", 
"percentage": "30.61%", "elapsed_time": "4m 36s", "remaining_time": "10m 26s"} +{"loss": 0.76670852, "token_acc": 0.77358568, "grad_norm": 0.28179508, "learning_rate": 8.079e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.272369, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "4m 53s", "remaining_time": "10m 5s"} +{"eval_loss": 0.69733685, "eval_token_acc": 0.6988189, "eval_runtime": 0.768, "eval_samples_per_second": 5.208, "eval_steps_per_second": 5.208, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "4m 54s", "remaining_time": "10m 6s"} +{"loss": 0.67761292, "token_acc": 0.80148612, "grad_norm": 0.17381968, "learning_rate": 7.806e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.272509, "epoch": 1.70707071, "global_step/max_steps": "85/245", "percentage": "34.69%", "elapsed_time": "5m 11s", "remaining_time": "9m 46s"} +{"loss": 0.67420764, "token_acc": 0.80448493, "grad_norm": 0.19926307, "learning_rate": 7.52e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.273947, "epoch": 1.80808081, "global_step/max_steps": "90/245", "percentage": "36.73%", "elapsed_time": "5m 28s", "remaining_time": "9m 25s"} +{"loss": 0.66861687, "token_acc": 0.80594648, "grad_norm": 0.14516698, "learning_rate": 7.222e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.274694, "epoch": 1.90909091, "global_step/max_steps": "95/245", "percentage": "38.78%", "elapsed_time": "5m 45s", "remaining_time": "9m 5s"} +{"loss": 0.65933204, "token_acc": 0.82487366, "grad_norm": 0.30001301, "learning_rate": 6.913e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276767, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "6m 0s", "remaining_time": "8m 43s"} +{"eval_loss": 0.66958618, "eval_token_acc": 0.69783465, "eval_runtime": 0.7354, "eval_samples_per_second": 5.439, "eval_steps_per_second": 5.439, "epoch": 2.0, "global_step/max_steps": "100/245", 
"percentage": "40.82%", "elapsed_time": "6m 1s", "remaining_time": "8m 44s"} +{"loss": 0.59896855, "token_acc": 0.81092891, "grad_norm": 0.2245685, "learning_rate": 6.597e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276373, "epoch": 2.1010101, "global_step/max_steps": "105/245", "percentage": "42.86%", "elapsed_time": "6m 19s", "remaining_time": "8m 26s"} +{"loss": 0.65544267, "token_acc": 0.80356012, "grad_norm": 0.16906558, "learning_rate": 6.272e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276792, "epoch": 2.2020202, "global_step/max_steps": "110/245", "percentage": "44.90%", "elapsed_time": "6m 37s", "remaining_time": "8m 7s"} +{"loss": 0.64242306, "token_acc": 0.81263564, "grad_norm": 0.20648974, "learning_rate": 5.942e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277166, "epoch": 2.3030303, "global_step/max_steps": "115/245", "percentage": "46.94%", "elapsed_time": "6m 54s", "remaining_time": "7m 48s"} +{"loss": 0.61157122, "token_acc": 0.83190721, "grad_norm": 0.28070071, "learning_rate": 5.608e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277492, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "7m 12s", "remaining_time": "7m 30s"} +{"eval_loss": 0.65457147, "eval_token_acc": 0.7007874, "eval_runtime": 0.6819, "eval_samples_per_second": 5.866, "eval_steps_per_second": 5.866, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "7m 12s", "remaining_time": "7m 30s"} +{"loss": 0.59941397, "token_acc": 0.80901046, "grad_norm": 0.24286442, "learning_rate": 5.271e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277135, "epoch": 2.50505051, "global_step/max_steps": "125/245", "percentage": "51.02%", "elapsed_time": "7m 30s", "remaining_time": "7m 12s"} +{"loss": 0.75781398, "token_acc": 0.79088615, "grad_norm": 0.21559028, "learning_rate": 4.932e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277505, "epoch": 2.60606061, 
"global_step/max_steps": "130/245", "percentage": "53.06%", "elapsed_time": "7m 48s", "remaining_time": "6m 54s"} +{"loss": 0.58236017, "token_acc": 0.83502296, "grad_norm": 0.23157109, "learning_rate": 4.594e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277732, "epoch": 2.70707071, "global_step/max_steps": "135/245", "percentage": "55.10%", "elapsed_time": "8m 5s", "remaining_time": "6m 35s"} +{"loss": 0.68083787, "token_acc": 0.79852241, "grad_norm": 0.20862757, "learning_rate": 4.258e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277895, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "8m 23s", "remaining_time": "6m 17s"} +{"eval_loss": 0.65126908, "eval_token_acc": 0.7007874, "eval_runtime": 0.6556, "eval_samples_per_second": 6.101, "eval_steps_per_second": 6.101, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "8m 24s", "remaining_time": "6m 18s"} +{"loss": 0.72178206, "token_acc": 0.79323564, "grad_norm": 0.24315313, "learning_rate": 3.925e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277709, "epoch": 2.90909091, "global_step/max_steps": "145/245", "percentage": "59.18%", "elapsed_time": "8m 41s", "remaining_time": "5m 59s"} +{"loss": 0.76466923, "token_acc": 0.78568333, "grad_norm": 0.3658199, "learning_rate": 3.597e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279058, "epoch": 3.0, "global_step/max_steps": "150/245", "percentage": "61.22%", "elapsed_time": "8m 57s", "remaining_time": "5m 40s"} +{"loss": 0.75427155, "token_acc": 0.79178756, "grad_norm": 0.32239309, "learning_rate": 3.276e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279417, "epoch": 3.1010101, "global_step/max_steps": "155/245", "percentage": "63.27%", "elapsed_time": "9m 14s", "remaining_time": "5m 21s"} +{"loss": 0.60939064, "token_acc": 0.82446659, "grad_norm": 0.18214311, "learning_rate": 2.962e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279613, "epoch": 
3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "9m 31s", "remaining_time": "5m 3s"} +{"eval_loss": 0.64151752, "eval_token_acc": 0.69980315, "eval_runtime": 0.6171, "eval_samples_per_second": 6.482, "eval_steps_per_second": 6.482, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "9m 32s", "remaining_time": "5m 4s"} +{"loss": 0.56366963, "token_acc": 0.83615429, "grad_norm": 0.29960144, "learning_rate": 2.658e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279575, "epoch": 3.3030303, "global_step/max_steps": "165/245", "percentage": "67.35%", "elapsed_time": "9m 49s", "remaining_time": "4m 45s"} +{"loss": 0.649721, "token_acc": 0.80240006, "grad_norm": 0.26891595, "learning_rate": 2.364e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279738, "epoch": 3.4040404, "global_step/max_steps": "170/245", "percentage": "69.39%", "elapsed_time": "10m 7s", "remaining_time": "4m 27s"} +{"loss": 0.62749486, "token_acc": 0.82276232, "grad_norm": 0.29831353, "learning_rate": 2.083e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280045, "epoch": 3.50505051, "global_step/max_steps": "175/245", "percentage": "71.43%", "elapsed_time": "10m 24s", "remaining_time": "4m 9s"} +{"loss": 0.55788679, "token_acc": 0.82703652, "grad_norm": 0.37596685, "learning_rate": 1.815e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280492, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "10m 41s", "remaining_time": "3m 51s"} +{"eval_loss": 0.64394552, "eval_token_acc": 0.7007874, "eval_runtime": 0.6302, "eval_samples_per_second": 6.347, "eval_steps_per_second": 6.347, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "10m 42s", "remaining_time": "3m 51s"} +{"loss": 0.60697393, "token_acc": 0.81814121, "grad_norm": 0.2696785, "learning_rate": 1.562e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280333, 
"epoch": 3.70707071, "global_step/max_steps": "185/245", "percentage": "75.51%", "elapsed_time": "10m 59s", "remaining_time": "3m 33s"} +{"loss": 0.62128577, "token_acc": 0.81397158, "grad_norm": 0.18010285, "learning_rate": 1.324e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280597, "epoch": 3.80808081, "global_step/max_steps": "190/245", "percentage": "77.55%", "elapsed_time": "11m 16s", "remaining_time": "3m 15s"} +{"loss": 0.66728768, "token_acc": 0.80815739, "grad_norm": 0.33335137, "learning_rate": 1.103e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280632, "epoch": 3.90909091, "global_step/max_steps": "195/245", "percentage": "79.59%", "elapsed_time": "11m 34s", "remaining_time": "2m 58s"} +{"loss": 0.58711443, "token_acc": 0.82038746, "grad_norm": 0.34127024, "learning_rate": 9e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281583, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "11m 49s", "remaining_time": "2m 39s"} +{"eval_loss": 0.63373297, "eval_token_acc": 0.6988189, "eval_runtime": 0.62, "eval_samples_per_second": 6.452, "eval_steps_per_second": 6.452, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "11m 50s", "remaining_time": "2m 39s"} +{"loss": 0.52709026, "token_acc": 0.83663004, "grad_norm": 0.43939781, "learning_rate": 7.16e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28149, "epoch": 4.1010101, "global_step/max_steps": "205/245", "percentage": "83.67%", "elapsed_time": "12m 7s", "remaining_time": "2m 22s"} +{"loss": 0.76443477, "token_acc": 0.77810709, "grad_norm": 0.2254554, "learning_rate": 5.51e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281717, "epoch": 4.2020202, "global_step/max_steps": "210/245", "percentage": "85.71%", "elapsed_time": "12m 25s", "remaining_time": "2m 4s"} +{"loss": 0.6509243, "token_acc": 0.81510559, "grad_norm": 0.20229189, "learning_rate": 4.07e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281915, 
"epoch": 4.3030303, "global_step/max_steps": "215/245", "percentage": "87.76%", "elapsed_time": "12m 42s", "remaining_time": "1m 46s"} +{"loss": 0.65167742, "token_acc": 0.80888752, "grad_norm": 0.33297428, "learning_rate": 2.84e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.282131, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "12m 59s", "remaining_time": "1m 28s"} +{"eval_loss": 0.6320821, "eval_token_acc": 0.69783465, "eval_runtime": 0.5918, "eval_samples_per_second": 6.759, "eval_steps_per_second": 6.759, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "13m 0s", "remaining_time": "1m 28s"} +{"loss": 0.57245245, "token_acc": 0.82668448, "grad_norm": 0.29022741, "learning_rate": 1.82e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28185, "epoch": 4.50505051, "global_step/max_steps": "225/245", "percentage": "91.84%", "elapsed_time": "13m 17s", "remaining_time": "1m 10s"} +{"loss": 0.56877184, "token_acc": 0.83470739, "grad_norm": 0.29305953, "learning_rate": 1.03e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281839, "epoch": 4.60606061, "global_step/max_steps": "230/245", "percentage": "93.88%", "elapsed_time": "13m 35s", "remaining_time": "53s"} +{"loss": 0.54464836, "token_acc": 0.83746954, "grad_norm": 0.23262919, "learning_rate": 4.6e-07, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.282162, "epoch": 4.70707071, "global_step/max_steps": "235/245", "percentage": "95.92%", "elapsed_time": "13m 52s", "remaining_time": "35s"} +{"loss": 0.56971836, "token_acc": 0.82939242, "grad_norm": 0.25554314, "learning_rate": 1.1e-07, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.282314, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "14m 9s", "remaining_time": "17s"} +{"eval_loss": 0.63181639, "eval_token_acc": 0.69783465, "eval_runtime": 0.7091, "eval_samples_per_second": 5.641, "eval_steps_per_second": 5.641, 
"epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "14m 10s", "remaining_time": "17s"} +{"loss": 0.55484476, "token_acc": 0.82555394, "grad_norm": 0.32179728, "learning_rate": 0.0, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28211, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 28s", "remaining_time": "0s"} +{"eval_loss": 0.63124788, "eval_token_acc": 0.69685039, "eval_runtime": 0.6316, "eval_samples_per_second": 6.333, "eval_steps_per_second": 6.333, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 28s", "remaining_time": "0s"} +{"train_runtime": 869.3128, "train_samples_per_second": 2.278, "train_steps_per_second": 0.282, "total_flos": 1.4131540621191168e+16, "train_loss": 0.68741596, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "14m 29s", "remaining_time": "0s"} +{"train_dataset": "775.398990±644.578527, min=41.000000, max=4149.000000, size=396", "val_dataset": "311.500000±316.897854, min=85.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1795.5528M Params (18.4648M Trainable [1.0284%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/checkpoint-245", "best_metric": 0.63124788, "global_step": 245, "log_history": [{"loss": 0.7305145859718323, "token_acc": 0.8209444687363359, "grad_norm": 0.3297547698020935, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 12.91, "train_speed(iter/s)": 0.060849, "epoch": 0.020202020202020204, "step": 1}, {"loss": 1.1813791990280151, "token_acc": 0.7339241207421767, "grad_norm": 0.9813061356544495, 
"learning_rate": 3.846153846153846e-05, "memory(GiB)": 22.77, "train_speed(iter/s)": 0.161645, "epoch": 0.10101010101010101, "step": 5}, {"loss": 0.9473533630371094, "token_acc": 0.7654348805768364, "grad_norm": 0.32120126485824585, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 29.22, "train_speed(iter/s)": 0.205858, "epoch": 0.20202020202020202, "step": 10}, {"loss": 0.893269157409668, "token_acc": 0.77, "grad_norm": 0.44191011786460876, "learning_rate": 9.99816643111642e-05, "memory(GiB)": 35.7, "train_speed(iter/s)": 0.226791, "epoch": 0.30303030303030304, "step": 15}, {"loss": 0.7626124858856201, "token_acc": 0.7875563297610482, "grad_norm": 0.21858885884284973, "learning_rate": 9.977554222133292e-05, "memory(GiB)": 35.71, "train_speed(iter/s)": 0.239697, "epoch": 0.40404040404040403, "step": 20}, {"eval_loss": 0.9943904280662537, "eval_token_acc": 0.6712598425196851, "eval_runtime": 0.6729, "eval_samples_per_second": 5.945, "eval_steps_per_second": 5.945, "epoch": 0.40404040404040403, "step": 20}, {"loss": 0.7563517093658447, "token_acc": 0.7827733080927591, "grad_norm": 0.157553568482399, "learning_rate": 9.934132612707632e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.24361, "epoch": 0.5050505050505051, "step": 25}, {"loss": 0.85587158203125, "token_acc": 0.7533065615444129, "grad_norm": 0.1752931773662567, "learning_rate": 9.868100580255466e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.250218, "epoch": 0.6060606060606061, "step": 30}, {"loss": 0.7806258201599121, "token_acc": 0.7831725282705675, "grad_norm": 0.19386179745197296, "learning_rate": 9.779760713358059e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.25421, "epoch": 0.7070707070707071, "step": 35}, {"loss": 0.6546980857849121, "token_acc": 0.8115121090835086, "grad_norm": 0.18464191257953644, "learning_rate": 9.669517825164434e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.258062, "epoch": 0.8080808080808081, "step": 40}, {"eval_loss": 0.7608067989349365, 
"eval_token_acc": 0.6811023622047244, "eval_runtime": 0.7105, "eval_samples_per_second": 5.629, "eval_steps_per_second": 5.629, "epoch": 0.8080808080808081, "step": 40}, {"loss": 0.6817403316497803, "token_acc": 0.7932936565539266, "grad_norm": 0.16011430323123932, "learning_rate": 9.537877098354786e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.260012, "epoch": 0.9090909090909091, "step": 45}, {"loss": 0.8535506248474121, "token_acc": 0.7650505932881998, "grad_norm": 0.1281077116727829, "learning_rate": 9.385441770165385e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.265003, "epoch": 1.0, "step": 50}, {"loss": 0.8319255828857421, "token_acc": 0.7755898436497137, "grad_norm": 0.17014774680137634, "learning_rate": 9.212910368083245e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.266811, "epoch": 1.101010101010101, "step": 55}, {"loss": 0.7703649520874023, "token_acc": 0.7764240128323265, "grad_norm": 0.1714233011007309, "learning_rate": 9.021073508877845e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.267776, "epoch": 1.202020202020202, "step": 60}, {"eval_loss": 0.7232142090797424, "eval_token_acc": 0.6909448818897638, "eval_runtime": 0.6604, "eval_samples_per_second": 6.057, "eval_steps_per_second": 6.057, "epoch": 1.202020202020202, "step": 60}, {"loss": 0.8008210182189941, "token_acc": 0.7724902939545203, "grad_norm": 0.2151585817337036, "learning_rate": 8.810810275638183e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.268541, "epoch": 1.303030303030303, "step": 65}, {"loss": 0.5416983127593994, "token_acc": 0.8375, "grad_norm": 0.15938813984394073, "learning_rate": 8.583084189417224e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.270264, "epoch": 1.404040404040404, "step": 70}, {"loss": 0.749873161315918, "token_acc": 0.7955103810395491, "grad_norm": 0.15911762416362762, "learning_rate": 8.338938793943478e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.27108, "epoch": 1.5050505050505052, "step": 75}, {"loss": 0.766708517074585, 
"token_acc": 0.773585676913015, "grad_norm": 0.28179508447647095, "learning_rate": 8.079492873632554e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.272369, "epoch": 1.606060606060606, "step": 80}, {"eval_loss": 0.6973368525505066, "eval_token_acc": 0.6988188976377953, "eval_runtime": 0.768, "eval_samples_per_second": 5.208, "eval_steps_per_second": 5.208, "epoch": 1.606060606060606, "step": 80}, {"loss": 0.6776129245758057, "token_acc": 0.8014861222408393, "grad_norm": 0.1738196760416031, "learning_rate": 7.805935326811912e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.272509, "epoch": 1.7070707070707072, "step": 85}, {"loss": 0.6742076396942138, "token_acc": 0.8044849334267694, "grad_norm": 0.1992630660533905, "learning_rate": 7.519519717652039e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.273947, "epoch": 1.808080808080808, "step": 90}, {"loss": 0.6686168670654297, "token_acc": 0.8059464816650148, "grad_norm": 0.14516697824001312, "learning_rate": 7.221558531769519e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.274694, "epoch": 1.9090909090909092, "step": 95}, {"loss": 0.6593320369720459, "token_acc": 0.8248736639685166, "grad_norm": 0.30001300573349, "learning_rate": 6.91341716182545e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276767, "epoch": 2.0, "step": 100}, {"eval_loss": 0.669586181640625, "eval_token_acc": 0.6978346456692913, "eval_runtime": 0.7354, "eval_samples_per_second": 5.439, "eval_steps_per_second": 5.439, "epoch": 2.0, "step": 100}, {"loss": 0.5989685535430909, "token_acc": 0.8109289125146409, "grad_norm": 0.22456850111484528, "learning_rate": 6.5965076506799e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276373, "epoch": 2.101010101010101, "step": 105}, {"loss": 0.6554426670074462, "token_acc": 0.8035601165589763, "grad_norm": 0.16906557977199554, "learning_rate": 6.272282220774091e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.276792, "epoch": 2.202020202020202, "step": 110}, {"loss": 0.6424230575561524, 
"token_acc": 0.8126356402218471, "grad_norm": 0.20648974180221558, "learning_rate": 5.9422266193915924e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277166, "epoch": 2.303030303030303, "step": 115}, {"loss": 0.611571216583252, "token_acc": 0.831907214404477, "grad_norm": 0.2807007133960724, "learning_rate": 5.6078533102935745e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277492, "epoch": 2.404040404040404, "step": 120}, {"eval_loss": 0.6545714735984802, "eval_token_acc": 0.7007874015748031, "eval_runtime": 0.6819, "eval_samples_per_second": 5.866, "eval_steps_per_second": 5.866, "epoch": 2.404040404040404, "step": 120}, {"loss": 0.5994139671325683, "token_acc": 0.8090104642179928, "grad_norm": 0.24286441504955292, "learning_rate": 5.270694542927088e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277135, "epoch": 2.505050505050505, "step": 125}, {"loss": 0.7578139781951905, "token_acc": 0.7908861533756404, "grad_norm": 0.21559028327465057, "learning_rate": 4.9322953309663916e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277505, "epoch": 2.606060606060606, "step": 130}, {"loss": 0.5823601722717285, "token_acc": 0.8350229562752288, "grad_norm": 0.23157109320163727, "learning_rate": 4.594206372362845e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277732, "epoch": 2.707070707070707, "step": 135}, {"loss": 0.680837869644165, "token_acc": 0.7985224101132818, "grad_norm": 0.20862756669521332, "learning_rate": 4.2579769433468694e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277895, "epoch": 2.808080808080808, "step": 140}, {"eval_loss": 0.6512690782546997, "eval_token_acc": 0.7007874015748031, "eval_runtime": 0.6556, "eval_samples_per_second": 6.101, "eval_steps_per_second": 6.101, "epoch": 2.808080808080808, "step": 140}, {"loss": 0.7217820644378662, "token_acc": 0.7932356449139547, "grad_norm": 0.24315312504768372, "learning_rate": 3.92514779894488e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.277709, "epoch": 2.909090909090909, 
"step": 145}, {"loss": 0.7646692276000977, "token_acc": 0.7856833347777104, "grad_norm": 0.36581990122795105, "learning_rate": 3.597244112544208e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279058, "epoch": 3.0, "step": 150}, {"loss": 0.7542715549468995, "token_acc": 0.7917875550397652, "grad_norm": 0.3223930895328522, "learning_rate": 3.275768486860149e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279417, "epoch": 3.101010101010101, "step": 155}, {"loss": 0.6093906402587891, "token_acc": 0.8244665896098217, "grad_norm": 0.18214310705661774, "learning_rate": 2.962194068331996e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279613, "epoch": 3.202020202020202, "step": 160}, {"eval_loss": 0.6415175199508667, "eval_token_acc": 0.6998031496062992, "eval_runtime": 0.6171, "eval_samples_per_second": 6.482, "eval_steps_per_second": 6.482, "epoch": 3.202020202020202, "step": 160}, {"loss": 0.5636696338653564, "token_acc": 0.8361542855424947, "grad_norm": 0.2996014356613159, "learning_rate": 2.65795779650105e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279575, "epoch": 3.303030303030303, "step": 165}, {"loss": 0.6497210025787353, "token_acc": 0.8024000617379226, "grad_norm": 0.26891595125198364, "learning_rate": 2.3644538193049625e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.279738, "epoch": 3.404040404040404, "step": 170}, {"loss": 0.6274948596954346, "token_acc": 0.8227623198122695, "grad_norm": 0.29831352829933167, "learning_rate": 2.08302710446253e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280045, "epoch": 3.505050505050505, "step": 175}, {"loss": 0.557886791229248, "token_acc": 0.8270365200162111, "grad_norm": 0.3759668469429016, "learning_rate": 1.8149672762244624e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280492, "epoch": 3.606060606060606, "step": 180}, {"eval_loss": 0.6439455151557922, "eval_token_acc": 0.7007874015748031, "eval_runtime": 0.6302, "eval_samples_per_second": 6.347, "eval_steps_per_second": 6.347, 
"epoch": 3.606060606060606, "step": 180}, {"loss": 0.606973934173584, "token_acc": 0.8181412108037283, "grad_norm": 0.2696785032749176, "learning_rate": 1.561502705732883e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280333, "epoch": 3.707070707070707, "step": 185}, {"loss": 0.6212857723236084, "token_acc": 0.8139715818353057, "grad_norm": 0.1801028549671173, "learning_rate": 1.3237948820702495e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280597, "epoch": 3.808080808080808, "step": 190}, {"loss": 0.6672876834869385, "token_acc": 0.8081573933667716, "grad_norm": 0.33335137367248535, "learning_rate": 1.102933089792042e-05, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.280632, "epoch": 3.909090909090909, "step": 195}, {"loss": 0.5871144294738769, "token_acc": 0.8203874623046696, "grad_norm": 0.34127023816108704, "learning_rate": 8.999294173332058e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281583, "epoch": 4.0, "step": 200}, {"eval_loss": 0.6337329745292664, "eval_token_acc": 0.6988188976377953, "eval_runtime": 0.62, "eval_samples_per_second": 6.452, "eval_steps_per_second": 6.452, "epoch": 4.0, "step": 200}, {"loss": 0.5270902633666992, "token_acc": 0.8366300366300367, "grad_norm": 0.43939781188964844, "learning_rate": 7.157141191620548e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28149, "epoch": 4.101010101010101, "step": 205}, {"loss": 0.7644347667694091, "token_acc": 0.7781070870244919, "grad_norm": 0.2254554033279419, "learning_rate": 5.5113135293435815e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281717, "epoch": 4.202020202020202, "step": 210}, {"loss": 0.6509243011474609, "token_acc": 0.815105591421381, "grad_norm": 0.20229189097881317, "learning_rate": 4.069353111818913e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281915, "epoch": 4.303030303030303, "step": 215}, {"loss": 0.651677417755127, "token_acc": 0.808887518415733, "grad_norm": 0.33297428488731384, "learning_rate": 2.8378676526178482e-06, "memory(GiB)": 45.11, 
"train_speed(iter/s)": 0.282131, "epoch": 4.404040404040404, "step": 220}, {"eval_loss": 0.6320821046829224, "eval_token_acc": 0.6978346456692913, "eval_runtime": 0.5918, "eval_samples_per_second": 6.759, "eval_steps_per_second": 6.759, "epoch": 4.404040404040404, "step": 220}, {"loss": 0.572452449798584, "token_acc": 0.8266844817713304, "grad_norm": 0.29022741317749023, "learning_rate": 1.8225003740388547e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28185, "epoch": 4.505050505050505, "step": 225}, {"loss": 0.5687718391418457, "token_acc": 0.8347073874020225, "grad_norm": 0.2930595278739929, "learning_rate": 1.0279041473154116e-06, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.281839, "epoch": 4.606060606060606, "step": 230}, {"loss": 0.5446483612060546, "token_acc": 0.8374695424600668, "grad_norm": 0.23262919485569, "learning_rate": 4.577201710596612e-07, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.282162, "epoch": 4.707070707070707, "step": 235}, {"loss": 0.5697183609008789, "token_acc": 0.8293924231593995, "grad_norm": 0.25554314255714417, "learning_rate": 1.1456128564660273e-07, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.282314, "epoch": 4.808080808080808, "step": 240}, {"eval_loss": 0.6318163871765137, "eval_token_acc": 0.6978346456692913, "eval_runtime": 0.7091, "eval_samples_per_second": 5.641, "eval_steps_per_second": 5.641, "epoch": 4.808080808080808, "step": 240}, {"loss": 0.5548447608947754, "token_acc": 0.8255539449161317, "grad_norm": 0.32179728150367737, "learning_rate": 0.0, "memory(GiB)": 45.11, "train_speed(iter/s)": 0.28211, "epoch": 4.909090909090909, "step": 245}, {"eval_loss": 0.631247878074646, "eval_token_acc": 0.6968503937007874, "eval_runtime": 0.6316, "eval_samples_per_second": 6.333, "eval_steps_per_second": 6.333, "epoch": 4.909090909090909, "step": 245}, {"train_runtime": 869.3128, "train_samples_per_second": 2.278, "train_steps_per_second": 0.282, "total_flos": 1.4131540621191168e+16, "train_loss": 0.6874159601269936, 
"epoch": 4.909090909090909, "step": 245}], "memory": 45.109375} diff --git a/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs/events.out.tfevents.1737750121.kml-dtmachine-18088-prod.53152.0 b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs/events.out.tfevents.1737750121.kml-dtmachine-18088-prod.53152.0 new file mode 100644 index 0000000000000000000000000000000000000000..f2a43e5563bbbcc0bf668b5dd852315d9a8614cd --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-1.5b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-202114/runs/events.out.tfevents.1737750121.kml-dtmachine-18088-prod.53152.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337d95df402c7a28355cf7368ad4b95cc6ba5dcc0b81298fb14867f3ea7cb1f6 +size 29537 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0e9d699dc2e71ce794631feb0160f9d828e5df82 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + 
"/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 
0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, 
+ "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + 
"boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, 
weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/README.md b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git 
a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f1cd67de22df32031c5f6c71177095244e04fe16 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "k_proj", + "v_proj", + "gate_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a328980037b2299300e721dcaaae353c379407a3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:02e18c6cf4b9b0f46b1b67b214d56317aea67177c41f571a93acb75c3374ce27 +size 275341720 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/additional_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0e9d699dc2e71ce794631feb0160f9d828e5df82 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + 
"split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + 
"log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + 
"hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + 
"vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, 
max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, 
dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/optimizer.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed6e57eca7492428040dc052047ce3892f8e0bbb --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e41233d48a0562fd480dd28e56185a8742bba7f55fcac991c7440cc9fa8ae55e +size 551070514 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/rng_state.pth b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e44fd167704ad2ad245c019bcc500ab2d1721432 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f46a0c67d07bcde056b3a2299f5f9d3f641fe5c6349c5c282bc7e8ddd9f528 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/scheduler.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5a7d8bc4abba06c30f7a7a85d5d4173984e4c9c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb96daf4c69d48b38d52e8ae266563af1b957cba209e2ff4307709a50fc6770 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/trainer_state.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d1dd814de563e2e361096e5988e789a43e80b52 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/trainer_state.json @@ -0,0 
+1,631 @@ +{ + "best_metric": 0.39637467, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240", + "epoch": 4.808080808080808, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.2041328251361847, + "learning_rate": 7.692307692307694e-06, + "loss": 0.5964576005935669, + "memory(GiB)": 38.49, + "step": 1, + "token_acc": 0.8541757761259292, + "train_speed(iter/s)": 0.027366 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.5847204327583313, + "learning_rate": 3.846153846153846e-05, + "loss": 0.904187798500061, + "memory(GiB)": 48.42, + "step": 5, + "token_acc": 0.7896981445582941, + "train_speed(iter/s)": 0.079647 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.2803400158882141, + "learning_rate": 7.692307692307693e-05, + "loss": 0.731820821762085, + "memory(GiB)": 54.88, + "step": 10, + "token_acc": 0.8111010965900556, + "train_speed(iter/s)": 0.104425 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.3598642945289612, + "learning_rate": 9.99816643111642e-05, + "loss": 0.5775248527526855, + "memory(GiB)": 61.41, + "step": 15, + "token_acc": 0.8338741721854305, + "train_speed(iter/s)": 0.115151 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.1787286400794983, + "learning_rate": 9.977554222133292e-05, + "loss": 0.5537004947662354, + "memory(GiB)": 61.41, + "step": 20, + "token_acc": 0.8298178513272996, + "train_speed(iter/s)": 0.122582 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.7036610841751099, + "eval_runtime": 0.854, + "eval_samples_per_second": 4.684, + "eval_steps_per_second": 4.684, + "eval_token_acc": 0.7283464566929134, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.12640731036663055, + "learning_rate": 
9.934132612707632e-05, + "loss": 0.5114118099212647, + "memory(GiB)": 70.81, + "step": 25, + "token_acc": 0.8343587316611453, + "train_speed(iter/s)": 0.124613 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.15377166867256165, + "learning_rate": 9.868100580255466e-05, + "loss": 0.5711065292358398, + "memory(GiB)": 70.81, + "step": 30, + "token_acc": 0.8206535755074973, + "train_speed(iter/s)": 0.127818 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.14515845477581024, + "learning_rate": 9.779760713358059e-05, + "loss": 0.5313561439514161, + "memory(GiB)": 70.81, + "step": 35, + "token_acc": 0.8319673548431719, + "train_speed(iter/s)": 0.130792 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.15607544779777527, + "learning_rate": 9.669517825164434e-05, + "loss": 0.42766599655151366, + "memory(GiB)": 70.81, + "step": 40, + "token_acc": 0.8610677701648464, + "train_speed(iter/s)": 0.133018 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.5766888856887817, + "eval_runtime": 0.9472, + "eval_samples_per_second": 4.223, + "eval_steps_per_second": 4.223, + "eval_token_acc": 0.7391732283464567, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.13710518181324005, + "learning_rate": 9.537877098354786e-05, + "loss": 0.46687989234924315, + "memory(GiB)": 70.81, + "step": 45, + "token_acc": 0.8420670634344607, + "train_speed(iter/s)": 0.134336 + }, + { + "epoch": 1.0, + "grad_norm": 0.11743508279323578, + "learning_rate": 9.385441770165385e-05, + "loss": 0.5958067417144776, + "memory(GiB)": 70.81, + "step": 50, + "token_acc": 0.8129140278081095, + "train_speed(iter/s)": 0.136973 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.13310575485229492, + "learning_rate": 9.212910368083245e-05, + "loss": 0.5059752941131592, + "memory(GiB)": 70.81, + "step": 55, + "token_acc": 0.8353572437164642, + "train_speed(iter/s)": 0.137155 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.15601898729801178, + "learning_rate": 
9.021073508877845e-05, + "loss": 0.44797539710998535, + "memory(GiB)": 70.81, + "step": 60, + "token_acc": 0.845841438858742, + "train_speed(iter/s)": 0.138205 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.5672237277030945, + "eval_runtime": 0.8765, + "eval_samples_per_second": 4.563, + "eval_steps_per_second": 4.563, + "eval_token_acc": 0.7332677165354331, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.17286072671413422, + "learning_rate": 8.810810275638183e-05, + "loss": 0.5194249153137207, + "memory(GiB)": 70.81, + "step": 65, + "token_acc": 0.8289147716768349, + "train_speed(iter/s)": 0.138881 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.16092103719711304, + "learning_rate": 8.583084189417224e-05, + "loss": 0.37634236812591554, + "memory(GiB)": 70.81, + "step": 70, + "token_acc": 0.8768530150753768, + "train_speed(iter/s)": 0.139264 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.1762753427028656, + "learning_rate": 8.338938793943478e-05, + "loss": 0.48749170303344724, + "memory(GiB)": 70.81, + "step": 75, + "token_acc": 0.8442603208247026, + "train_speed(iter/s)": 0.140816 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.32298213243484497, + "learning_rate": 8.079492873632554e-05, + "loss": 0.45453643798828125, + "memory(GiB)": 70.81, + "step": 80, + "token_acc": 0.8541121648136036, + "train_speed(iter/s)": 0.141607 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.5171214938163757, + "eval_runtime": 0.937, + "eval_samples_per_second": 4.269, + "eval_steps_per_second": 4.269, + "eval_token_acc": 0.7391732283464567, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.17536115646362305, + "learning_rate": 7.805935326811912e-05, + "loss": 0.4341718673706055, + "memory(GiB)": 70.81, + "step": 85, + "token_acc": 0.8526262111167772, + "train_speed(iter/s)": 0.14176 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.2086581289768219, + "learning_rate": 7.519519717652039e-05, + 
"loss": 0.4266983509063721, + "memory(GiB)": 70.81, + "step": 90, + "token_acc": 0.856147317604921, + "train_speed(iter/s)": 0.142225 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.18198587000370026, + "learning_rate": 7.221558531769519e-05, + "loss": 0.4482563972473145, + "memory(GiB)": 70.81, + "step": 95, + "token_acc": 0.8495209778658738, + "train_speed(iter/s)": 0.142524 + }, + { + "epoch": 2.0, + "grad_norm": 0.3201996088027954, + "learning_rate": 6.91341716182545e-05, + "loss": 0.42622933387756345, + "memory(GiB)": 70.81, + "step": 100, + "token_acc": 0.8711596082465006, + "train_speed(iter/s)": 0.143465 + }, + { + "epoch": 2.0, + "eval_loss": 0.4652397632598877, + "eval_runtime": 0.8776, + "eval_samples_per_second": 4.558, + "eval_steps_per_second": 4.558, + "eval_token_acc": 0.7470472440944882, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.2608080208301544, + "learning_rate": 6.5965076506799e-05, + "loss": 0.35508630275726316, + "memory(GiB)": 70.81, + "step": 105, + "token_acc": 0.8730065771691143, + "train_speed(iter/s)": 0.143604 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.18070940673351288, + "learning_rate": 6.272282220774091e-05, + "loss": 0.3007711172103882, + "memory(GiB)": 70.81, + "step": 110, + "token_acc": 0.9026669200557456, + "train_speed(iter/s)": 0.143611 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.2512786388397217, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.3588543891906738, + "memory(GiB)": 70.81, + "step": 115, + "token_acc": 0.8820030544168476, + "train_speed(iter/s)": 0.144208 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.4274783730506897, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.32741105556488037, + "memory(GiB)": 70.81, + "step": 120, + "token_acc": 0.8927774848939535, + "train_speed(iter/s)": 0.144535 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.44838735461235046, + "eval_runtime": 0.8624, + "eval_samples_per_second": 4.638, + 
"eval_steps_per_second": 4.638, + "eval_token_acc": 0.7460629921259843, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.3210482895374298, + "learning_rate": 5.270694542927088e-05, + "loss": 0.345516300201416, + "memory(GiB)": 70.81, + "step": 125, + "token_acc": 0.8747805323407543, + "train_speed(iter/s)": 0.144419 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.2985243499279022, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.384252667427063, + "memory(GiB)": 70.81, + "step": 130, + "token_acc": 0.8706323113387874, + "train_speed(iter/s)": 0.144826 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.3411076068878174, + "learning_rate": 4.594206372362845e-05, + "loss": 0.32880301475524903, + "memory(GiB)": 70.81, + "step": 135, + "token_acc": 0.8895941823560225, + "train_speed(iter/s)": 0.144704 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.3216821253299713, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.38271095752716067, + "memory(GiB)": 70.81, + "step": 140, + "token_acc": 0.8675094401576096, + "train_speed(iter/s)": 0.144971 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.46010422706604004, + "eval_runtime": 0.9301, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 4.301, + "eval_token_acc": 0.7480314960629921, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.36269646883010864, + "learning_rate": 3.92514779894488e-05, + "loss": 0.3969467878341675, + "memory(GiB)": 70.81, + "step": 145, + "token_acc": 0.8653660475947067, + "train_speed(iter/s)": 0.144702 + }, + { + "epoch": 3.0, + "grad_norm": 0.5357916355133057, + "learning_rate": 3.597244112544208e-05, + "loss": 0.4235672473907471, + "memory(GiB)": 70.81, + "step": 150, + "token_acc": 0.865672935263021, + "train_speed(iter/s)": 0.145348 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.43691495060920715, + "learning_rate": 3.275768486860149e-05, + "loss": 0.2840510606765747, + "memory(GiB)": 70.81, + 
"step": 155, + "token_acc": 0.9106051814178926, + "train_speed(iter/s)": 0.145398 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.2997167408466339, + "learning_rate": 2.962194068331996e-05, + "loss": 0.2537685871124268, + "memory(GiB)": 70.81, + "step": 160, + "token_acc": 0.9156227546780794, + "train_speed(iter/s)": 0.145506 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.4409746527671814, + "eval_runtime": 0.8875, + "eval_samples_per_second": 4.507, + "eval_steps_per_second": 4.507, + "eval_token_acc": 0.7490157480314961, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.5427218079566956, + "learning_rate": 2.65795779650105e-05, + "loss": 0.265443754196167, + "memory(GiB)": 70.81, + "step": 165, + "token_acc": 0.9111324895529568, + "train_speed(iter/s)": 0.145292 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.4990173876285553, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.26773905754089355, + "memory(GiB)": 70.81, + "step": 170, + "token_acc": 0.9053480475382003, + "train_speed(iter/s)": 0.145583 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.4650072455406189, + "learning_rate": 2.08302710446253e-05, + "loss": 0.2679251194000244, + "memory(GiB)": 70.81, + "step": 175, + "token_acc": 0.9111967817633255, + "train_speed(iter/s)": 0.14568 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.5438686013221741, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.2113558053970337, + "memory(GiB)": 70.82, + "step": 180, + "token_acc": 0.922006574503535, + "train_speed(iter/s)": 0.146101 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.40819939970970154, + "eval_runtime": 0.8545, + "eval_samples_per_second": 4.681, + "eval_steps_per_second": 4.681, + "eval_token_acc": 0.7421259842519685, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.4500384032726288, + "learning_rate": 1.561502705732883e-05, + "loss": 0.23794655799865722, + "memory(GiB)": 70.82, + "step": 185, + "token_acc": 
0.9139989875227063, + "train_speed(iter/s)": 0.145854 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.28911277651786804, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.28251683712005615, + "memory(GiB)": 70.82, + "step": 190, + "token_acc": 0.8974243478973015, + "train_speed(iter/s)": 0.146186 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.5827483534812927, + "learning_rate": 1.102933089792042e-05, + "loss": 0.23283705711364747, + "memory(GiB)": 70.82, + "step": 195, + "token_acc": 0.9216532524592713, + "train_speed(iter/s)": 0.146399 + }, + { + "epoch": 4.0, + "grad_norm": 0.6791924238204956, + "learning_rate": 8.999294173332058e-06, + "loss": 0.2710264205932617, + "memory(GiB)": 70.82, + "step": 200, + "token_acc": 0.9085716896646258, + "train_speed(iter/s)": 0.146896 + }, + { + "epoch": 4.0, + "eval_loss": 0.3976582884788513, + "eval_runtime": 0.9014, + "eval_samples_per_second": 4.438, + "eval_steps_per_second": 4.438, + "eval_token_acc": 0.7460629921259843, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.6375091671943665, + "learning_rate": 7.157141191620548e-06, + "loss": 0.16443511247634887, + "memory(GiB)": 70.82, + "step": 205, + "token_acc": 0.9361095045305572, + "train_speed(iter/s)": 0.146797 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.42739006876945496, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.23344297409057618, + "memory(GiB)": 70.82, + "step": 210, + "token_acc": 0.9198150078165711, + "train_speed(iter/s)": 0.146669 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.38036471605300903, + "learning_rate": 4.069353111818913e-06, + "loss": 0.24862170219421387, + "memory(GiB)": 70.82, + "step": 215, + "token_acc": 0.9194660247291826, + "train_speed(iter/s)": 0.146512 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.47875192761421204, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.2238011837005615, + "memory(GiB)": 70.82, + "step": 220, + "token_acc": 
0.9291122760132936, + "train_speed(iter/s)": 0.146645 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.3966984152793884, + "eval_runtime": 1.0375, + "eval_samples_per_second": 3.856, + "eval_steps_per_second": 3.856, + "eval_token_acc": 0.7460629921259843, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.4872428774833679, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.17960745096206665, + "memory(GiB)": 70.82, + "step": 225, + "token_acc": 0.9339186867722847, + "train_speed(iter/s)": 0.146615 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.5575739741325378, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.17248222827911378, + "memory(GiB)": 70.82, + "step": 230, + "token_acc": 0.9405840068228891, + "train_speed(iter/s)": 0.146886 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.4236765205860138, + "learning_rate": 4.577201710596612e-07, + "loss": 0.2282116413116455, + "memory(GiB)": 70.82, + "step": 235, + "token_acc": 0.92505189062359, + "train_speed(iter/s)": 0.147141 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.5662180781364441, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.18331977128982543, + "memory(GiB)": 70.82, + "step": 240, + "token_acc": 0.9365260900643316, + "train_speed(iter/s)": 0.146712 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.3963746726512909, + "eval_runtime": 0.9453, + "eval_samples_per_second": 4.231, + "eval_steps_per_second": 4.231, + "eval_token_acc": 0.7470472440944882, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2652226225455104e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff 
--git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/training_args.bin b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..141e292a060454c1eda26060e9efc98995dc5b9e --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78eca164fcc139cc157937f9854f57f6fbdb69006d1969ea89d1977af2f064d4 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/README.md b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### 
Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f1cd67de22df32031c5f6c71177095244e04fe16 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", 
+ "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "k_proj", + "v_proj", + "gate_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13af5d92830ee28da946ebcc384cee908342f02f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d384f7574e8ed201855a317667f76adf267e8aca5cab3d8246bb2b49db9763 +size 275341720 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/additional_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..0e9d699dc2e71ce794631feb0160f9d828e5df82 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/optimizer.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a81d667064742b1f1c45ee7f02ea52a400f0fe2 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ef47cc66f74c79ce5ea6481b2c1b92ce9ebf3b04916d51d775e6bd47e0467f +size 551070514 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/rng_state.pth b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f6518780e1d06dd19fe80d7cc2cd48390121d22 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d15ac0ed47a4722419b11d617539b531bc3b39c94ffd342cf461d1bbccfe51 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c07b610e877e513fda3813a64af716a38654c2f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ba022925b97a0c60fdb73ede217e52b3b55c5065f112ff19fea77b6a69dd5d +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/trainer_state.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9304907775f2fa1687b304e8790bbe01328e1f24 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/trainer_state.json @@ -0,0 +1,650 @@ +{ + "best_metric": 0.3948561, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245", + "epoch": 4.909090909090909, + "eval_steps": 20, + "global_step": 245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.2041328251361847, + "learning_rate": 7.692307692307694e-06, + "loss": 0.5964576005935669, + "memory(GiB)": 38.49, + "step": 1, + "token_acc": 0.8541757761259292, + "train_speed(iter/s)": 0.027366 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.5847204327583313, + "learning_rate": 3.846153846153846e-05, + "loss": 0.904187798500061, + "memory(GiB)": 48.42, + "step": 5, + "token_acc": 0.7896981445582941, + "train_speed(iter/s)": 0.079647 + }, + { + "epoch": 
0.20202020202020202, + "grad_norm": 0.2803400158882141, + "learning_rate": 7.692307692307693e-05, + "loss": 0.731820821762085, + "memory(GiB)": 54.88, + "step": 10, + "token_acc": 0.8111010965900556, + "train_speed(iter/s)": 0.104425 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.3598642945289612, + "learning_rate": 9.99816643111642e-05, + "loss": 0.5775248527526855, + "memory(GiB)": 61.41, + "step": 15, + "token_acc": 0.8338741721854305, + "train_speed(iter/s)": 0.115151 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.1787286400794983, + "learning_rate": 9.977554222133292e-05, + "loss": 0.5537004947662354, + "memory(GiB)": 61.41, + "step": 20, + "token_acc": 0.8298178513272996, + "train_speed(iter/s)": 0.122582 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.7036610841751099, + "eval_runtime": 0.854, + "eval_samples_per_second": 4.684, + "eval_steps_per_second": 4.684, + "eval_token_acc": 0.7283464566929134, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.12640731036663055, + "learning_rate": 9.934132612707632e-05, + "loss": 0.5114118099212647, + "memory(GiB)": 70.81, + "step": 25, + "token_acc": 0.8343587316611453, + "train_speed(iter/s)": 0.124613 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.15377166867256165, + "learning_rate": 9.868100580255466e-05, + "loss": 0.5711065292358398, + "memory(GiB)": 70.81, + "step": 30, + "token_acc": 0.8206535755074973, + "train_speed(iter/s)": 0.127818 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.14515845477581024, + "learning_rate": 9.779760713358059e-05, + "loss": 0.5313561439514161, + "memory(GiB)": 70.81, + "step": 35, + "token_acc": 0.8319673548431719, + "train_speed(iter/s)": 0.130792 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.15607544779777527, + "learning_rate": 9.669517825164434e-05, + "loss": 0.42766599655151366, + "memory(GiB)": 70.81, + "step": 40, + "token_acc": 0.8610677701648464, + "train_speed(iter/s)": 0.133018 + }, + { + 
"epoch": 0.8080808080808081, + "eval_loss": 0.5766888856887817, + "eval_runtime": 0.9472, + "eval_samples_per_second": 4.223, + "eval_steps_per_second": 4.223, + "eval_token_acc": 0.7391732283464567, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.13710518181324005, + "learning_rate": 9.537877098354786e-05, + "loss": 0.46687989234924315, + "memory(GiB)": 70.81, + "step": 45, + "token_acc": 0.8420670634344607, + "train_speed(iter/s)": 0.134336 + }, + { + "epoch": 1.0, + "grad_norm": 0.11743508279323578, + "learning_rate": 9.385441770165385e-05, + "loss": 0.5958067417144776, + "memory(GiB)": 70.81, + "step": 50, + "token_acc": 0.8129140278081095, + "train_speed(iter/s)": 0.136973 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.13310575485229492, + "learning_rate": 9.212910368083245e-05, + "loss": 0.5059752941131592, + "memory(GiB)": 70.81, + "step": 55, + "token_acc": 0.8353572437164642, + "train_speed(iter/s)": 0.137155 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.15601898729801178, + "learning_rate": 9.021073508877845e-05, + "loss": 0.44797539710998535, + "memory(GiB)": 70.81, + "step": 60, + "token_acc": 0.845841438858742, + "train_speed(iter/s)": 0.138205 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.5672237277030945, + "eval_runtime": 0.8765, + "eval_samples_per_second": 4.563, + "eval_steps_per_second": 4.563, + "eval_token_acc": 0.7332677165354331, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.17286072671413422, + "learning_rate": 8.810810275638183e-05, + "loss": 0.5194249153137207, + "memory(GiB)": 70.81, + "step": 65, + "token_acc": 0.8289147716768349, + "train_speed(iter/s)": 0.138881 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.16092103719711304, + "learning_rate": 8.583084189417224e-05, + "loss": 0.37634236812591554, + "memory(GiB)": 70.81, + "step": 70, + "token_acc": 0.8768530150753768, + "train_speed(iter/s)": 0.139264 + }, + { + "epoch": 1.5050505050505052, + 
"grad_norm": 0.1762753427028656, + "learning_rate": 8.338938793943478e-05, + "loss": 0.48749170303344724, + "memory(GiB)": 70.81, + "step": 75, + "token_acc": 0.8442603208247026, + "train_speed(iter/s)": 0.140816 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.32298213243484497, + "learning_rate": 8.079492873632554e-05, + "loss": 0.45453643798828125, + "memory(GiB)": 70.81, + "step": 80, + "token_acc": 0.8541121648136036, + "train_speed(iter/s)": 0.141607 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.5171214938163757, + "eval_runtime": 0.937, + "eval_samples_per_second": 4.269, + "eval_steps_per_second": 4.269, + "eval_token_acc": 0.7391732283464567, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.17536115646362305, + "learning_rate": 7.805935326811912e-05, + "loss": 0.4341718673706055, + "memory(GiB)": 70.81, + "step": 85, + "token_acc": 0.8526262111167772, + "train_speed(iter/s)": 0.14176 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.2086581289768219, + "learning_rate": 7.519519717652039e-05, + "loss": 0.4266983509063721, + "memory(GiB)": 70.81, + "step": 90, + "token_acc": 0.856147317604921, + "train_speed(iter/s)": 0.142225 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.18198587000370026, + "learning_rate": 7.221558531769519e-05, + "loss": 0.4482563972473145, + "memory(GiB)": 70.81, + "step": 95, + "token_acc": 0.8495209778658738, + "train_speed(iter/s)": 0.142524 + }, + { + "epoch": 2.0, + "grad_norm": 0.3201996088027954, + "learning_rate": 6.91341716182545e-05, + "loss": 0.42622933387756345, + "memory(GiB)": 70.81, + "step": 100, + "token_acc": 0.8711596082465006, + "train_speed(iter/s)": 0.143465 + }, + { + "epoch": 2.0, + "eval_loss": 0.4652397632598877, + "eval_runtime": 0.8776, + "eval_samples_per_second": 4.558, + "eval_steps_per_second": 4.558, + "eval_token_acc": 0.7470472440944882, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.2608080208301544, + "learning_rate": 
6.5965076506799e-05, + "loss": 0.35508630275726316, + "memory(GiB)": 70.81, + "step": 105, + "token_acc": 0.8730065771691143, + "train_speed(iter/s)": 0.143604 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.18070940673351288, + "learning_rate": 6.272282220774091e-05, + "loss": 0.3007711172103882, + "memory(GiB)": 70.81, + "step": 110, + "token_acc": 0.9026669200557456, + "train_speed(iter/s)": 0.143611 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.2512786388397217, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.3588543891906738, + "memory(GiB)": 70.81, + "step": 115, + "token_acc": 0.8820030544168476, + "train_speed(iter/s)": 0.144208 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.4274783730506897, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.32741105556488037, + "memory(GiB)": 70.81, + "step": 120, + "token_acc": 0.8927774848939535, + "train_speed(iter/s)": 0.144535 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.44838735461235046, + "eval_runtime": 0.8624, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 4.638, + "eval_token_acc": 0.7460629921259843, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.3210482895374298, + "learning_rate": 5.270694542927088e-05, + "loss": 0.345516300201416, + "memory(GiB)": 70.81, + "step": 125, + "token_acc": 0.8747805323407543, + "train_speed(iter/s)": 0.144419 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.2985243499279022, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.384252667427063, + "memory(GiB)": 70.81, + "step": 130, + "token_acc": 0.8706323113387874, + "train_speed(iter/s)": 0.144826 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.3411076068878174, + "learning_rate": 4.594206372362845e-05, + "loss": 0.32880301475524903, + "memory(GiB)": 70.81, + "step": 135, + "token_acc": 0.8895941823560225, + "train_speed(iter/s)": 0.144704 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.3216821253299713, + 
"learning_rate": 4.2579769433468694e-05, + "loss": 0.38271095752716067, + "memory(GiB)": 70.81, + "step": 140, + "token_acc": 0.8675094401576096, + "train_speed(iter/s)": 0.144971 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.46010422706604004, + "eval_runtime": 0.9301, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 4.301, + "eval_token_acc": 0.7480314960629921, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.36269646883010864, + "learning_rate": 3.92514779894488e-05, + "loss": 0.3969467878341675, + "memory(GiB)": 70.81, + "step": 145, + "token_acc": 0.8653660475947067, + "train_speed(iter/s)": 0.144702 + }, + { + "epoch": 3.0, + "grad_norm": 0.5357916355133057, + "learning_rate": 3.597244112544208e-05, + "loss": 0.4235672473907471, + "memory(GiB)": 70.81, + "step": 150, + "token_acc": 0.865672935263021, + "train_speed(iter/s)": 0.145348 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.43691495060920715, + "learning_rate": 3.275768486860149e-05, + "loss": 0.2840510606765747, + "memory(GiB)": 70.81, + "step": 155, + "token_acc": 0.9106051814178926, + "train_speed(iter/s)": 0.145398 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.2997167408466339, + "learning_rate": 2.962194068331996e-05, + "loss": 0.2537685871124268, + "memory(GiB)": 70.81, + "step": 160, + "token_acc": 0.9156227546780794, + "train_speed(iter/s)": 0.145506 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.4409746527671814, + "eval_runtime": 0.8875, + "eval_samples_per_second": 4.507, + "eval_steps_per_second": 4.507, + "eval_token_acc": 0.7490157480314961, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.5427218079566956, + "learning_rate": 2.65795779650105e-05, + "loss": 0.265443754196167, + "memory(GiB)": 70.81, + "step": 165, + "token_acc": 0.9111324895529568, + "train_speed(iter/s)": 0.145292 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.4990173876285553, + "learning_rate": 2.3644538193049625e-05, 
+ "loss": 0.26773905754089355, + "memory(GiB)": 70.81, + "step": 170, + "token_acc": 0.9053480475382003, + "train_speed(iter/s)": 0.145583 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.4650072455406189, + "learning_rate": 2.08302710446253e-05, + "loss": 0.2679251194000244, + "memory(GiB)": 70.81, + "step": 175, + "token_acc": 0.9111967817633255, + "train_speed(iter/s)": 0.14568 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.5438686013221741, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.2113558053970337, + "memory(GiB)": 70.82, + "step": 180, + "token_acc": 0.922006574503535, + "train_speed(iter/s)": 0.146101 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.40819939970970154, + "eval_runtime": 0.8545, + "eval_samples_per_second": 4.681, + "eval_steps_per_second": 4.681, + "eval_token_acc": 0.7421259842519685, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.4500384032726288, + "learning_rate": 1.561502705732883e-05, + "loss": 0.23794655799865722, + "memory(GiB)": 70.82, + "step": 185, + "token_acc": 0.9139989875227063, + "train_speed(iter/s)": 0.145854 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.28911277651786804, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.28251683712005615, + "memory(GiB)": 70.82, + "step": 190, + "token_acc": 0.8974243478973015, + "train_speed(iter/s)": 0.146186 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.5827483534812927, + "learning_rate": 1.102933089792042e-05, + "loss": 0.23283705711364747, + "memory(GiB)": 70.82, + "step": 195, + "token_acc": 0.9216532524592713, + "train_speed(iter/s)": 0.146399 + }, + { + "epoch": 4.0, + "grad_norm": 0.6791924238204956, + "learning_rate": 8.999294173332058e-06, + "loss": 0.2710264205932617, + "memory(GiB)": 70.82, + "step": 200, + "token_acc": 0.9085716896646258, + "train_speed(iter/s)": 0.146896 + }, + { + "epoch": 4.0, + "eval_loss": 0.3976582884788513, + "eval_runtime": 0.9014, + "eval_samples_per_second": 4.438, 
+ "eval_steps_per_second": 4.438, + "eval_token_acc": 0.7460629921259843, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.6375091671943665, + "learning_rate": 7.157141191620548e-06, + "loss": 0.16443511247634887, + "memory(GiB)": 70.82, + "step": 205, + "token_acc": 0.9361095045305572, + "train_speed(iter/s)": 0.146797 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.42739006876945496, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.23344297409057618, + "memory(GiB)": 70.82, + "step": 210, + "token_acc": 0.9198150078165711, + "train_speed(iter/s)": 0.146669 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.38036471605300903, + "learning_rate": 4.069353111818913e-06, + "loss": 0.24862170219421387, + "memory(GiB)": 70.82, + "step": 215, + "token_acc": 0.9194660247291826, + "train_speed(iter/s)": 0.146512 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.47875192761421204, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.2238011837005615, + "memory(GiB)": 70.82, + "step": 220, + "token_acc": 0.9291122760132936, + "train_speed(iter/s)": 0.146645 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.3966984152793884, + "eval_runtime": 1.0375, + "eval_samples_per_second": 3.856, + "eval_steps_per_second": 3.856, + "eval_token_acc": 0.7460629921259843, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.4872428774833679, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.17960745096206665, + "memory(GiB)": 70.82, + "step": 225, + "token_acc": 0.9339186867722847, + "train_speed(iter/s)": 0.146615 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.5575739741325378, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.17248222827911378, + "memory(GiB)": 70.82, + "step": 230, + "token_acc": 0.9405840068228891, + "train_speed(iter/s)": 0.146886 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.4236765205860138, + "learning_rate": 4.577201710596612e-07, + "loss": 0.2282116413116455, + 
"memory(GiB)": 70.82, + "step": 235, + "token_acc": 0.92505189062359, + "train_speed(iter/s)": 0.147141 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.5662180781364441, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.18331977128982543, + "memory(GiB)": 70.82, + "step": 240, + "token_acc": 0.9365260900643316, + "train_speed(iter/s)": 0.146712 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.3963746726512909, + "eval_runtime": 0.9453, + "eval_samples_per_second": 4.231, + "eval_steps_per_second": 4.231, + "eval_token_acc": 0.7470472440944882, + "step": 240 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.5550493001937866, + "learning_rate": 0.0, + "loss": 0.21774885654449463, + "memory(GiB)": 70.82, + "step": 245, + "token_acc": 0.9170842824601366, + "train_speed(iter/s)": 0.146816 + }, + { + "epoch": 4.909090909090909, + "eval_loss": 0.3948560953140259, + "eval_runtime": 0.8762, + "eval_samples_per_second": 4.565, + "eval_steps_per_second": 4.565, + "eval_token_acc": 0.7450787401574803, + "step": 245 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2874308328848384e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/training_args.bin b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..141e292a060454c1eda26060e9efc98995dc5b9e --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78eca164fcc139cc157937f9854f57f6fbdb69006d1969ea89d1977af2f064d4 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b27edd9eac875d6f5aa748eb99d1d51f51557cca Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..5b1707b51399dad988ef4620b732f45e3d445fa4 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..c6bc3e49569e327ce83b9566462af2e3c17e27b2 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_steps_per_second.png 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..592311d099a3984478eb41c573212a60d7210b75 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..d41f12cdd21282ab74f5e631179b71a3257304f7 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..9cc2f5018175ad195aab7254264c4c8fdcaa870b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..3d541077eaa4821d41494ced06e4f0b11c6d577a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_grad_norm.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..1a747bb27a6bf92800a1028c0eaaaa4305471d7f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..efb189764a3ecd90e2d71a8643f24c4be062737a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..5889cfaf18ea27780b4a300d8f6b22b3a4ff4467 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..05ae1b78ba680bc61b225ec4e942d03a43abced1 Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..a8bdbcd9a32f1043781d591b03afa89364690c43 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..3e8705a77e19f38a5ff657fe2f8a450b1f5f99fb Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_runtime.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..590f6908445b4cdf0d7976d00e791492e4b6045a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_samples_per_second.png 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..833c2d06bcdd18cdf63bd6490d8a10a92408d830 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..c51f31b438f0ee4a2ecd4e40b8ef6c822bb9695a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..6d242da8e9cd842011b4e37097202e483a6df41a Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/logging.jsonl b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d73af995ec5fffbc84e05b8f711c9bfd11a2f1c --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/logging.jsonl @@ -0,0 +1,65 @@ +{"loss": 0.5964576, "token_acc": 0.85417578, "grad_norm": 0.20413283, "learning_rate": 7.69e-06, "memory(GiB)": 38.49, "train_speed(iter/s)": 0.027366, "epoch": 0.02020202, "global_step/max_steps": "1/245", "percentage": "0.41%", "elapsed_time": "36s", "remaining_time": "2h 26m 45s"} +{"loss": 0.9041878, "token_acc": 0.78969814, "grad_norm": 0.58472043, "learning_rate": 3.846e-05, "memory(GiB)": 48.42, "train_speed(iter/s)": 0.079647, "epoch": 0.1010101, "global_step/max_steps": "5/245", "percentage": "2.04%", "elapsed_time": "1m 2s", "remaining_time": "49m 51s"} +{"loss": 0.73182082, "token_acc": 0.8111011, "grad_norm": 0.28034002, "learning_rate": 7.692e-05, "memory(GiB)": 54.88, "train_speed(iter/s)": 0.104425, "epoch": 0.2020202, "global_step/max_steps": "10/245", "percentage": "4.08%", "elapsed_time": "1m 35s", "remaining_time": "37m 19s"} +{"loss": 0.57752485, "token_acc": 0.83387417, "grad_norm": 0.35986429, "learning_rate": 9.998e-05, "memory(GiB)": 61.41, "train_speed(iter/s)": 0.115151, "epoch": 0.3030303, "global_step/max_steps": "15/245", "percentage": "6.12%", "elapsed_time": "2m 9s", "remaining_time": "33m 10s"} +{"loss": 0.55370049, "token_acc": 0.82981785, "grad_norm": 0.17872864, "learning_rate": 9.978e-05, "memory(GiB)": 61.41, "train_speed(iter/s)": 0.122582, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "2m 42s", "remaining_time": "30m 30s"} +{"eval_loss": 0.70366108, "eval_token_acc": 0.72834646, "eval_runtime": 0.854, "eval_samples_per_second": 4.684, "eval_steps_per_second": 4.684, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "2m 43s", "remaining_time": "30m 40s"} +{"loss": 0.51141181, "token_acc": 0.83435873, "grad_norm": 0.12640731, "learning_rate": 9.934e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.124613, 
"epoch": 0.50505051, "global_step/max_steps": "25/245", "percentage": "10.20%", "elapsed_time": "3m 20s", "remaining_time": "29m 21s"} +{"loss": 0.57110653, "token_acc": 0.82065358, "grad_norm": 0.15377167, "learning_rate": 9.868e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.127818, "epoch": 0.60606061, "global_step/max_steps": "30/245", "percentage": "12.24%", "elapsed_time": "3m 54s", "remaining_time": "27m 58s"} +{"loss": 0.53135614, "token_acc": 0.83196735, "grad_norm": 0.14515845, "learning_rate": 9.78e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.130792, "epoch": 0.70707071, "global_step/max_steps": "35/245", "percentage": "14.29%", "elapsed_time": "4m 27s", "remaining_time": "26m 42s"} +{"loss": 0.427666, "token_acc": 0.86106777, "grad_norm": 0.15607545, "learning_rate": 9.67e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.133018, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "5m 0s", "remaining_time": "25m 38s"} +{"eval_loss": 0.57668889, "eval_token_acc": 0.73917323, "eval_runtime": 0.9472, "eval_samples_per_second": 4.223, "eval_steps_per_second": 4.223, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "5m 1s", "remaining_time": "25m 43s"} +{"loss": 0.46687989, "token_acc": 0.84206706, "grad_norm": 0.13710518, "learning_rate": 9.538e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.134336, "epoch": 0.90909091, "global_step/max_steps": "45/245", "percentage": "18.37%", "elapsed_time": "5m 34s", "remaining_time": "24m 46s"} +{"loss": 0.59580674, "token_acc": 0.81291403, "grad_norm": 0.11743508, "learning_rate": 9.385e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.136973, "epoch": 1.0, "global_step/max_steps": "50/245", "percentage": "20.41%", "elapsed_time": "6m 4s", "remaining_time": "23m 41s"} +{"loss": 0.50597529, "token_acc": 0.83535724, "grad_norm": 0.13310575, "learning_rate": 9.213e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 
0.137155, "epoch": 1.1010101, "global_step/max_steps": "55/245", "percentage": "22.45%", "elapsed_time": "6m 40s", "remaining_time": "23m 3s"} +{"loss": 0.4479754, "token_acc": 0.84584144, "grad_norm": 0.15601899, "learning_rate": 9.021e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.138205, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "7m 13s", "remaining_time": "22m 17s"} +{"eval_loss": 0.56722373, "eval_token_acc": 0.73326772, "eval_runtime": 0.8765, "eval_samples_per_second": 4.563, "eval_steps_per_second": 4.563, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "7m 14s", "remaining_time": "22m 19s"} +{"loss": 0.51942492, "token_acc": 0.82891477, "grad_norm": 0.17286073, "learning_rate": 8.811e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.138881, "epoch": 1.3030303, "global_step/max_steps": "65/245", "percentage": "26.53%", "elapsed_time": "7m 47s", "remaining_time": "21m 34s"} +{"loss": 0.37634237, "token_acc": 0.87685302, "grad_norm": 0.16092104, "learning_rate": 8.583e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.139264, "epoch": 1.4040404, "global_step/max_steps": "70/245", "percentage": "28.57%", "elapsed_time": "8m 22s", "remaining_time": "20m 55s"} +{"loss": 0.4874917, "token_acc": 0.84426032, "grad_norm": 0.17627534, "learning_rate": 8.339e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.140816, "epoch": 1.50505051, "global_step/max_steps": "75/245", "percentage": "30.61%", "elapsed_time": "8m 52s", "remaining_time": "20m 6s"} +{"loss": 0.45453644, "token_acc": 0.85411216, "grad_norm": 0.32298213, "learning_rate": 8.079e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.141607, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "9m 24s", "remaining_time": "19m 24s"} +{"eval_loss": 0.51712149, "eval_token_acc": 0.73917323, "eval_runtime": 0.937, "eval_samples_per_second": 4.269, 
"eval_steps_per_second": 4.269, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "9m 25s", "remaining_time": "19m 26s"} +{"loss": 0.43417187, "token_acc": 0.85262621, "grad_norm": 0.17536116, "learning_rate": 7.806e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.14176, "epoch": 1.70707071, "global_step/max_steps": "85/245", "percentage": "34.69%", "elapsed_time": "9m 59s", "remaining_time": "18m 47s"} +{"loss": 0.42669835, "token_acc": 0.85614732, "grad_norm": 0.20865813, "learning_rate": 7.52e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.142225, "epoch": 1.80808081, "global_step/max_steps": "90/245", "percentage": "36.73%", "elapsed_time": "10m 32s", "remaining_time": "18m 9s"} +{"loss": 0.4482564, "token_acc": 0.84952098, "grad_norm": 0.18198587, "learning_rate": 7.222e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.142524, "epoch": 1.90909091, "global_step/max_steps": "95/245", "percentage": "38.78%", "elapsed_time": "11m 6s", "remaining_time": "17m 31s"} +{"loss": 0.42622933, "token_acc": 0.87115961, "grad_norm": 0.32019961, "learning_rate": 6.913e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.143465, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "11m 36s", "remaining_time": "16m 50s"} +{"eval_loss": 0.46523976, "eval_token_acc": 0.74704724, "eval_runtime": 0.8776, "eval_samples_per_second": 4.558, "eval_steps_per_second": 4.558, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "11m 37s", "remaining_time": "16m 51s"} +{"loss": 0.3550863, "token_acc": 0.87300658, "grad_norm": 0.26080802, "learning_rate": 6.597e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.143604, "epoch": 2.1010101, "global_step/max_steps": "105/245", "percentage": "42.86%", "elapsed_time": "12m 10s", "remaining_time": "16m 14s"} +{"loss": 0.30077112, "token_acc": 0.90266692, "grad_norm": 0.18070941, "learning_rate": 6.272e-05, 
"memory(GiB)": 70.81, "train_speed(iter/s)": 0.143611, "epoch": 2.2020202, "global_step/max_steps": "110/245", "percentage": "44.90%", "elapsed_time": "12m 45s", "remaining_time": "15m 39s"} +{"loss": 0.35885439, "token_acc": 0.88200305, "grad_norm": 0.25127864, "learning_rate": 5.942e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144208, "epoch": 2.3030303, "global_step/max_steps": "115/245", "percentage": "46.94%", "elapsed_time": "13m 17s", "remaining_time": "15m 0s"} +{"loss": 0.32741106, "token_acc": 0.89277748, "grad_norm": 0.42747837, "learning_rate": 5.608e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144535, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "13m 49s", "remaining_time": "14m 24s"} +{"eval_loss": 0.44838735, "eval_token_acc": 0.74606299, "eval_runtime": 0.8624, "eval_samples_per_second": 4.638, "eval_steps_per_second": 4.638, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "13m 50s", "remaining_time": "14m 25s"} +{"loss": 0.3455163, "token_acc": 0.87478053, "grad_norm": 0.32104829, "learning_rate": 5.271e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144419, "epoch": 2.50505051, "global_step/max_steps": "125/245", "percentage": "51.02%", "elapsed_time": "14m 25s", "remaining_time": "13m 50s"} +{"loss": 0.38425267, "token_acc": 0.87063231, "grad_norm": 0.29852435, "learning_rate": 4.932e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144826, "epoch": 2.60606061, "global_step/max_steps": "130/245", "percentage": "53.06%", "elapsed_time": "14m 57s", "remaining_time": "13m 13s"} +{"loss": 0.32880301, "token_acc": 0.88959418, "grad_norm": 0.34110761, "learning_rate": 4.594e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144704, "epoch": 2.70707071, "global_step/max_steps": "135/245", "percentage": "55.10%", "elapsed_time": "15m 32s", "remaining_time": "12m 39s"} +{"loss": 0.38271096, "token_acc": 0.86750944, "grad_norm": 
0.32168213, "learning_rate": 4.258e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144971, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "16m 5s", "remaining_time": "12m 3s"} +{"eval_loss": 0.46010423, "eval_token_acc": 0.7480315, "eval_runtime": 0.9301, "eval_samples_per_second": 4.301, "eval_steps_per_second": 4.301, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "16m 6s", "remaining_time": "12m 4s"} +{"loss": 0.39694679, "token_acc": 0.86536605, "grad_norm": 0.36269647, "learning_rate": 3.925e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144702, "epoch": 2.90909091, "global_step/max_steps": "145/245", "percentage": "59.18%", "elapsed_time": "16m 41s", "remaining_time": "11m 30s"} +{"loss": 0.42356725, "token_acc": 0.86567294, "grad_norm": 0.53579164, "learning_rate": 3.597e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145348, "epoch": 3.0, "global_step/max_steps": "150/245", "percentage": "61.22%", "elapsed_time": "17m 11s", "remaining_time": "10m 53s"} +{"loss": 0.28405106, "token_acc": 0.91060518, "grad_norm": 0.43691495, "learning_rate": 3.276e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145398, "epoch": 3.1010101, "global_step/max_steps": "155/245", "percentage": "63.27%", "elapsed_time": "17m 45s", "remaining_time": "10m 18s"} +{"loss": 0.25376859, "token_acc": 0.91562275, "grad_norm": 0.29971674, "learning_rate": 2.962e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145506, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "18m 19s", "remaining_time": "9m 43s"} +{"eval_loss": 0.44097465, "eval_token_acc": 0.74901575, "eval_runtime": 0.8875, "eval_samples_per_second": 4.507, "eval_steps_per_second": 4.507, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "18m 20s", "remaining_time": "9m 44s"} +{"loss": 0.26544375, "token_acc": 0.91113249, 
"grad_norm": 0.54272181, "learning_rate": 2.658e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145292, "epoch": 3.3030303, "global_step/max_steps": "165/245", "percentage": "67.35%", "elapsed_time": "18m 55s", "remaining_time": "9m 10s"} +{"loss": 0.26773906, "token_acc": 0.90534805, "grad_norm": 0.49901739, "learning_rate": 2.364e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145583, "epoch": 3.4040404, "global_step/max_steps": "170/245", "percentage": "69.39%", "elapsed_time": "19m 27s", "remaining_time": "8m 34s"} +{"loss": 0.26792512, "token_acc": 0.91119678, "grad_norm": 0.46500725, "learning_rate": 2.083e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.14568, "epoch": 3.50505051, "global_step/max_steps": "175/245", "percentage": "71.43%", "elapsed_time": "20m 0s", "remaining_time": "8m 0s"} +{"loss": 0.21135581, "token_acc": 0.92200657, "grad_norm": 0.5438686, "learning_rate": 1.815e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146101, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "20m 31s", "remaining_time": "7m 24s"} +{"eval_loss": 0.4081994, "eval_token_acc": 0.74212598, "eval_runtime": 0.8545, "eval_samples_per_second": 4.681, "eval_steps_per_second": 4.681, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "20m 32s", "remaining_time": "7m 25s"} +{"loss": 0.23794656, "token_acc": 0.91399899, "grad_norm": 0.4500384, "learning_rate": 1.562e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.145854, "epoch": 3.70707071, "global_step/max_steps": "185/245", "percentage": "75.51%", "elapsed_time": "21m 7s", "remaining_time": "6m 51s"} +{"loss": 0.28251684, "token_acc": 0.89742435, "grad_norm": 0.28911278, "learning_rate": 1.324e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146186, "epoch": 3.80808081, "global_step/max_steps": "190/245", "percentage": "77.55%", "elapsed_time": "21m 39s", "remaining_time": "6m 16s"} +{"loss": 0.23283706, 
"token_acc": 0.92165325, "grad_norm": 0.58274835, "learning_rate": 1.103e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146399, "epoch": 3.90909091, "global_step/max_steps": "195/245", "percentage": "79.59%", "elapsed_time": "22m 11s", "remaining_time": "5m 41s"} +{"loss": 0.27102642, "token_acc": 0.90857169, "grad_norm": 0.67919242, "learning_rate": 9e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146896, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "22m 41s", "remaining_time": "5m 6s"} +{"eval_loss": 0.39765829, "eval_token_acc": 0.74606299, "eval_runtime": 0.9014, "eval_samples_per_second": 4.438, "eval_steps_per_second": 4.438, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "22m 41s", "remaining_time": "5m 6s"} +{"loss": 0.16443511, "token_acc": 0.9361095, "grad_norm": 0.63750917, "learning_rate": 7.16e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146797, "epoch": 4.1010101, "global_step/max_steps": "205/245", "percentage": "83.67%", "elapsed_time": "23m 16s", "remaining_time": "4m 32s"} +{"loss": 0.23344297, "token_acc": 0.91981501, "grad_norm": 0.42739007, "learning_rate": 5.51e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146669, "epoch": 4.2020202, "global_step/max_steps": "210/245", "percentage": "85.71%", "elapsed_time": "23m 51s", "remaining_time": "3m 58s"} +{"loss": 0.2486217, "token_acc": 0.91946602, "grad_norm": 0.38036472, "learning_rate": 4.07e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146512, "epoch": 4.3030303, "global_step/max_steps": "215/245", "percentage": "87.76%", "elapsed_time": "24m 26s", "remaining_time": "3m 24s"} +{"loss": 0.22380118, "token_acc": 0.92911228, "grad_norm": 0.47875193, "learning_rate": 2.84e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146645, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "24m 59s", "remaining_time": "2m 50s"} +{"eval_loss": 
0.39669842, "eval_token_acc": 0.74606299, "eval_runtime": 1.0375, "eval_samples_per_second": 3.856, "eval_steps_per_second": 3.856, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "25m 0s", "remaining_time": "2m 50s"} +{"loss": 0.17960745, "token_acc": 0.93391869, "grad_norm": 0.48724288, "learning_rate": 1.82e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146615, "epoch": 4.50505051, "global_step/max_steps": "225/245", "percentage": "91.84%", "elapsed_time": "25m 34s", "remaining_time": "2m 16s"} +{"loss": 0.17248223, "token_acc": 0.94058401, "grad_norm": 0.55757397, "learning_rate": 1.03e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146886, "epoch": 4.60606061, "global_step/max_steps": "230/245", "percentage": "93.88%", "elapsed_time": "26m 5s", "remaining_time": "1m 42s"} +{"loss": 0.22821164, "token_acc": 0.92505189, "grad_norm": 0.42367652, "learning_rate": 4.6e-07, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.147141, "epoch": 4.70707071, "global_step/max_steps": "235/245", "percentage": "95.92%", "elapsed_time": "26m 36s", "remaining_time": "1m 7s"} +{"loss": 0.18331977, "token_acc": 0.93652609, "grad_norm": 0.56621808, "learning_rate": 1.1e-07, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146712, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "27m 15s", "remaining_time": "34s"} +{"eval_loss": 0.39637467, "eval_token_acc": 0.74704724, "eval_runtime": 0.9453, "eval_samples_per_second": 4.231, "eval_steps_per_second": 4.231, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "27m 16s", "remaining_time": "34s"} +{"loss": 0.21774886, "token_acc": 0.91708428, "grad_norm": 0.5550493, "learning_rate": 0.0, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146816, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 48s", "remaining_time": "0s"} +{"eval_loss": 
0.3948561, "eval_token_acc": 0.74507874, "eval_runtime": 0.8762, "eval_samples_per_second": 4.565, "eval_steps_per_second": 4.565, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 49s", "remaining_time": "0s"} +{"train_runtime": 1670.3415, "train_samples_per_second": 1.185, "train_steps_per_second": 0.147, "total_flos": 1.2874308328848384e+17, "train_loss": 0.37484119, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 50s", "remaining_time": "0s"} +{"train_dataset": "784.851010±638.096273, min=60.000000, max=4149.000000, size=396", "val_dataset": "325.750000±308.768825, min=104.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 14838.8465M Params (68.8128M Trainable [0.4637%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/checkpoint-245", "best_metric": 0.3948561, "global_step": 245, "log_history": [{"loss": 0.5964576005935669, "token_acc": 0.8541757761259292, "grad_norm": 0.2041328251361847, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 38.49, "train_speed(iter/s)": 0.027366, "epoch": 0.020202020202020204, "step": 1}, {"loss": 0.904187798500061, "token_acc": 0.7896981445582941, "grad_norm": 0.5847204327583313, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 48.42, "train_speed(iter/s)": 0.079647, "epoch": 0.10101010101010101, "step": 5}, {"loss": 0.731820821762085, "token_acc": 0.8111010965900556, "grad_norm": 0.2803400158882141, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 54.88, "train_speed(iter/s)": 0.104425, "epoch": 0.20202020202020202, "step": 10}, {"loss": 0.5775248527526855, "token_acc": 0.8338741721854305, 
"grad_norm": 0.3598642945289612, "learning_rate": 9.99816643111642e-05, "memory(GiB)": 61.41, "train_speed(iter/s)": 0.115151, "epoch": 0.30303030303030304, "step": 15}, {"loss": 0.5537004947662354, "token_acc": 0.8298178513272996, "grad_norm": 0.1787286400794983, "learning_rate": 9.977554222133292e-05, "memory(GiB)": 61.41, "train_speed(iter/s)": 0.122582, "epoch": 0.40404040404040403, "step": 20}, {"eval_loss": 0.7036610841751099, "eval_token_acc": 0.7283464566929134, "eval_runtime": 0.854, "eval_samples_per_second": 4.684, "eval_steps_per_second": 4.684, "epoch": 0.40404040404040403, "step": 20}, {"loss": 0.5114118099212647, "token_acc": 0.8343587316611453, "grad_norm": 0.12640731036663055, "learning_rate": 9.934132612707632e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.124613, "epoch": 0.5050505050505051, "step": 25}, {"loss": 0.5711065292358398, "token_acc": 0.8206535755074973, "grad_norm": 0.15377166867256165, "learning_rate": 9.868100580255466e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.127818, "epoch": 0.6060606060606061, "step": 30}, {"loss": 0.5313561439514161, "token_acc": 0.8319673548431719, "grad_norm": 0.14515845477581024, "learning_rate": 9.779760713358059e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.130792, "epoch": 0.7070707070707071, "step": 35}, {"loss": 0.42766599655151366, "token_acc": 0.8610677701648464, "grad_norm": 0.15607544779777527, "learning_rate": 9.669517825164434e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.133018, "epoch": 0.8080808080808081, "step": 40}, {"eval_loss": 0.5766888856887817, "eval_token_acc": 0.7391732283464567, "eval_runtime": 0.9472, "eval_samples_per_second": 4.223, "eval_steps_per_second": 4.223, "epoch": 0.8080808080808081, "step": 40}, {"loss": 0.46687989234924315, "token_acc": 0.8420670634344607, "grad_norm": 0.13710518181324005, "learning_rate": 9.537877098354786e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.134336, "epoch": 0.9090909090909091, "step": 45}, {"loss": 
0.5958067417144776, "token_acc": 0.8129140278081095, "grad_norm": 0.11743508279323578, "learning_rate": 9.385441770165385e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.136973, "epoch": 1.0, "step": 50}, {"loss": 0.5059752941131592, "token_acc": 0.8353572437164642, "grad_norm": 0.13310575485229492, "learning_rate": 9.212910368083245e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.137155, "epoch": 1.101010101010101, "step": 55}, {"loss": 0.44797539710998535, "token_acc": 0.845841438858742, "grad_norm": 0.15601898729801178, "learning_rate": 9.021073508877845e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.138205, "epoch": 1.202020202020202, "step": 60}, {"eval_loss": 0.5672237277030945, "eval_token_acc": 0.7332677165354331, "eval_runtime": 0.8765, "eval_samples_per_second": 4.563, "eval_steps_per_second": 4.563, "epoch": 1.202020202020202, "step": 60}, {"loss": 0.5194249153137207, "token_acc": 0.8289147716768349, "grad_norm": 0.17286072671413422, "learning_rate": 8.810810275638183e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.138881, "epoch": 1.303030303030303, "step": 65}, {"loss": 0.37634236812591554, "token_acc": 0.8768530150753768, "grad_norm": 0.16092103719711304, "learning_rate": 8.583084189417224e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.139264, "epoch": 1.404040404040404, "step": 70}, {"loss": 0.48749170303344724, "token_acc": 0.8442603208247026, "grad_norm": 0.1762753427028656, "learning_rate": 8.338938793943478e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.140816, "epoch": 1.5050505050505052, "step": 75}, {"loss": 0.45453643798828125, "token_acc": 0.8541121648136036, "grad_norm": 0.32298213243484497, "learning_rate": 8.079492873632554e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.141607, "epoch": 1.606060606060606, "step": 80}, {"eval_loss": 0.5171214938163757, "eval_token_acc": 0.7391732283464567, "eval_runtime": 0.937, "eval_samples_per_second": 4.269, "eval_steps_per_second": 4.269, "epoch": 1.606060606060606, 
"step": 80}, {"loss": 0.4341718673706055, "token_acc": 0.8526262111167772, "grad_norm": 0.17536115646362305, "learning_rate": 7.805935326811912e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.14176, "epoch": 1.7070707070707072, "step": 85}, {"loss": 0.4266983509063721, "token_acc": 0.856147317604921, "grad_norm": 0.2086581289768219, "learning_rate": 7.519519717652039e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.142225, "epoch": 1.808080808080808, "step": 90}, {"loss": 0.4482563972473145, "token_acc": 0.8495209778658738, "grad_norm": 0.18198587000370026, "learning_rate": 7.221558531769519e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.142524, "epoch": 1.9090909090909092, "step": 95}, {"loss": 0.42622933387756345, "token_acc": 0.8711596082465006, "grad_norm": 0.3201996088027954, "learning_rate": 6.91341716182545e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.143465, "epoch": 2.0, "step": 100}, {"eval_loss": 0.4652397632598877, "eval_token_acc": 0.7470472440944882, "eval_runtime": 0.8776, "eval_samples_per_second": 4.558, "eval_steps_per_second": 4.558, "epoch": 2.0, "step": 100}, {"loss": 0.35508630275726316, "token_acc": 0.8730065771691143, "grad_norm": 0.2608080208301544, "learning_rate": 6.5965076506799e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.143604, "epoch": 2.101010101010101, "step": 105}, {"loss": 0.3007711172103882, "token_acc": 0.9026669200557456, "grad_norm": 0.18070940673351288, "learning_rate": 6.272282220774091e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.143611, "epoch": 2.202020202020202, "step": 110}, {"loss": 0.3588543891906738, "token_acc": 0.8820030544168476, "grad_norm": 0.2512786388397217, "learning_rate": 5.9422266193915924e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144208, "epoch": 2.303030303030303, "step": 115}, {"loss": 0.32741105556488037, "token_acc": 0.8927774848939535, "grad_norm": 0.4274783730506897, "learning_rate": 5.6078533102935745e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 
0.144535, "epoch": 2.404040404040404, "step": 120}, {"eval_loss": 0.44838735461235046, "eval_token_acc": 0.7460629921259843, "eval_runtime": 0.8624, "eval_samples_per_second": 4.638, "eval_steps_per_second": 4.638, "epoch": 2.404040404040404, "step": 120}, {"loss": 0.345516300201416, "token_acc": 0.8747805323407543, "grad_norm": 0.3210482895374298, "learning_rate": 5.270694542927088e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144419, "epoch": 2.505050505050505, "step": 125}, {"loss": 0.384252667427063, "token_acc": 0.8706323113387874, "grad_norm": 0.2985243499279022, "learning_rate": 4.9322953309663916e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144826, "epoch": 2.606060606060606, "step": 130}, {"loss": 0.32880301475524903, "token_acc": 0.8895941823560225, "grad_norm": 0.3411076068878174, "learning_rate": 4.594206372362845e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144704, "epoch": 2.707070707070707, "step": 135}, {"loss": 0.38271095752716067, "token_acc": 0.8675094401576096, "grad_norm": 0.3216821253299713, "learning_rate": 4.2579769433468694e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144971, "epoch": 2.808080808080808, "step": 140}, {"eval_loss": 0.46010422706604004, "eval_token_acc": 0.7480314960629921, "eval_runtime": 0.9301, "eval_samples_per_second": 4.301, "eval_steps_per_second": 4.301, "epoch": 2.808080808080808, "step": 140}, {"loss": 0.3969467878341675, "token_acc": 0.8653660475947067, "grad_norm": 0.36269646883010864, "learning_rate": 3.92514779894488e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.144702, "epoch": 2.909090909090909, "step": 145}, {"loss": 0.4235672473907471, "token_acc": 0.865672935263021, "grad_norm": 0.5357916355133057, "learning_rate": 3.597244112544208e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145348, "epoch": 3.0, "step": 150}, {"loss": 0.2840510606765747, "token_acc": 0.9106051814178926, "grad_norm": 0.43691495060920715, "learning_rate": 3.275768486860149e-05, "memory(GiB)": 70.81, 
"train_speed(iter/s)": 0.145398, "epoch": 3.101010101010101, "step": 155}, {"loss": 0.2537685871124268, "token_acc": 0.9156227546780794, "grad_norm": 0.2997167408466339, "learning_rate": 2.962194068331996e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145506, "epoch": 3.202020202020202, "step": 160}, {"eval_loss": 0.4409746527671814, "eval_token_acc": 0.7490157480314961, "eval_runtime": 0.8875, "eval_samples_per_second": 4.507, "eval_steps_per_second": 4.507, "epoch": 3.202020202020202, "step": 160}, {"loss": 0.265443754196167, "token_acc": 0.9111324895529568, "grad_norm": 0.5427218079566956, "learning_rate": 2.65795779650105e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145292, "epoch": 3.303030303030303, "step": 165}, {"loss": 0.26773905754089355, "token_acc": 0.9053480475382003, "grad_norm": 0.4990173876285553, "learning_rate": 2.3644538193049625e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.145583, "epoch": 3.404040404040404, "step": 170}, {"loss": 0.2679251194000244, "token_acc": 0.9111967817633255, "grad_norm": 0.4650072455406189, "learning_rate": 2.08302710446253e-05, "memory(GiB)": 70.81, "train_speed(iter/s)": 0.14568, "epoch": 3.505050505050505, "step": 175}, {"loss": 0.2113558053970337, "token_acc": 0.922006574503535, "grad_norm": 0.5438686013221741, "learning_rate": 1.8149672762244624e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146101, "epoch": 3.606060606060606, "step": 180}, {"eval_loss": 0.40819939970970154, "eval_token_acc": 0.7421259842519685, "eval_runtime": 0.8545, "eval_samples_per_second": 4.681, "eval_steps_per_second": 4.681, "epoch": 3.606060606060606, "step": 180}, {"loss": 0.23794655799865722, "token_acc": 0.9139989875227063, "grad_norm": 0.4500384032726288, "learning_rate": 1.561502705732883e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.145854, "epoch": 3.707070707070707, "step": 185}, {"loss": 0.28251683712005615, "token_acc": 0.8974243478973015, "grad_norm": 0.28911277651786804, "learning_rate": 
1.3237948820702495e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146186, "epoch": 3.808080808080808, "step": 190}, {"loss": 0.23283705711364747, "token_acc": 0.9216532524592713, "grad_norm": 0.5827483534812927, "learning_rate": 1.102933089792042e-05, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146399, "epoch": 3.909090909090909, "step": 195}, {"loss": 0.2710264205932617, "token_acc": 0.9085716896646258, "grad_norm": 0.6791924238204956, "learning_rate": 8.999294173332058e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146896, "epoch": 4.0, "step": 200}, {"eval_loss": 0.3976582884788513, "eval_token_acc": 0.7460629921259843, "eval_runtime": 0.9014, "eval_samples_per_second": 4.438, "eval_steps_per_second": 4.438, "epoch": 4.0, "step": 200}, {"loss": 0.16443511247634887, "token_acc": 0.9361095045305572, "grad_norm": 0.6375091671943665, "learning_rate": 7.157141191620548e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146797, "epoch": 4.101010101010101, "step": 205}, {"loss": 0.23344297409057618, "token_acc": 0.9198150078165711, "grad_norm": 0.42739006876945496, "learning_rate": 5.5113135293435815e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146669, "epoch": 4.202020202020202, "step": 210}, {"loss": 0.24862170219421387, "token_acc": 0.9194660247291826, "grad_norm": 0.38036471605300903, "learning_rate": 4.069353111818913e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146512, "epoch": 4.303030303030303, "step": 215}, {"loss": 0.2238011837005615, "token_acc": 0.9291122760132936, "grad_norm": 0.47875192761421204, "learning_rate": 2.8378676526178482e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146645, "epoch": 4.404040404040404, "step": 220}, {"eval_loss": 0.3966984152793884, "eval_token_acc": 0.7460629921259843, "eval_runtime": 1.0375, "eval_samples_per_second": 3.856, "eval_steps_per_second": 3.856, "epoch": 4.404040404040404, "step": 220}, {"loss": 0.17960745096206665, "token_acc": 0.9339186867722847, "grad_norm": 0.4872428774833679, 
"learning_rate": 1.8225003740388547e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146615, "epoch": 4.505050505050505, "step": 225}, {"loss": 0.17248222827911378, "token_acc": 0.9405840068228891, "grad_norm": 0.5575739741325378, "learning_rate": 1.0279041473154116e-06, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146886, "epoch": 4.606060606060606, "step": 230}, {"loss": 0.2282116413116455, "token_acc": 0.92505189062359, "grad_norm": 0.4236765205860138, "learning_rate": 4.577201710596612e-07, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.147141, "epoch": 4.707070707070707, "step": 235}, {"loss": 0.18331977128982543, "token_acc": 0.9365260900643316, "grad_norm": 0.5662180781364441, "learning_rate": 1.1456128564660273e-07, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146712, "epoch": 4.808080808080808, "step": 240}, {"eval_loss": 0.3963746726512909, "eval_token_acc": 0.7470472440944882, "eval_runtime": 0.9453, "eval_samples_per_second": 4.231, "eval_steps_per_second": 4.231, "epoch": 4.808080808080808, "step": 240}, {"loss": 0.21774885654449463, "token_acc": 0.9170842824601366, "grad_norm": 0.5550493001937866, "learning_rate": 0.0, "memory(GiB)": 70.82, "train_speed(iter/s)": 0.146816, "epoch": 4.909090909090909, "step": 245}, {"eval_loss": 0.3948560953140259, "eval_token_acc": 0.7450787401574803, "eval_runtime": 0.8762, "eval_samples_per_second": 4.565, "eval_steps_per_second": 4.565, "epoch": 4.909090909090909, "step": 245}, {"train_runtime": 1670.3415, "train_samples_per_second": 1.185, "train_steps_per_second": 0.147, "total_flos": 1.2874308328848384e+17, "train_loss": 0.374841186221765, "epoch": 4.909090909090909, "step": 245}], "memory": 70.81640625} diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs/events.out.tfevents.1737755842.kml-dtmachine-18088-prod.61512.0 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs/events.out.tfevents.1737755842.kml-dtmachine-18088-prod.61512.0 new file mode 100644 index 0000000000000000000000000000000000000000..1f773fb320e9321b2eb0e8bb7d3ce041bdcd9293 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_random20/v0-20250124-215528/runs/events.out.tfevents.1737755842.kml-dtmachine-18088-prod.61512.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3acfb9a4e799134f7b8f3488755e4511faaacfa2b3356ee9b1685211e2aa24 +size 29528 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7117a9ad365fb868e74b28952e78d4993087857b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, 
+ null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs", + "logging_strategy": "steps", + 
"logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + 
"include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + 
"galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, 
logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, 
hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/README.md b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- 
**Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e053b4864e625b192524b138960982274555d7f0 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "down_proj", + "q_proj", + "v_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b68e739038b8a129451fbab822f7a6fbd0065a8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e06a473317b2b18fdf1af42f9b8911005e77f0d66385299cd35d42211c9524b +size 275341720 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/additional_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..7117a9ad365fb868e74b28952e78d4993087857b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/optimizer.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5b989a32b88f4f44662125cb84a824756db7145 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d8e096a6c92c3d511a871a9b17ae7be6b83d58ea860190856a8492e73bc2f6 +size 551070514 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/rng_state.pth b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ff30ec5cd0d2c60602405d1cc6dd10cc8a701c7 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0653c78d8bc6be578c9cd744bad660e1256d39e5b6bb0e969bade152bd24dde8 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/scheduler.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f5a7d8bc4abba06c30f7a7a85d5d4173984e4c9c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb96daf4c69d48b38d52e8ae266563af1b957cba209e2ff4307709a50fc6770 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/trainer_state.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9347d4a85d7d4d15fde722b3676d0a8c6c77042d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_metric": 0.38130462, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240", + "epoch": 4.808080808080808, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.21148298680782318, + "learning_rate": 7.692307692307694e-06, + "loss": 0.6009877920150757, + "memory(GiB)": 38.38, + "step": 1, + "token_acc": 0.8543944031482291, + "train_speed(iter/s)": 0.028445 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.5691107511520386, + "learning_rate": 3.846153846153846e-05, + "loss": 0.9119688868522644, + "memory(GiB)": 48.25, + "step": 5, + "token_acc": 0.7889227360841872, + "train_speed(iter/s)": 0.08115 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.27697205543518066, + "learning_rate": 7.692307692307693e-05, + "loss": 0.7359254837036133, + "memory(GiB)": 54.71, + "step": 10, + "token_acc": 
0.8115517500375544, + "train_speed(iter/s)": 0.106586 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.48245590925216675, + "learning_rate": 9.99816643111642e-05, + "loss": 0.5770383834838867, + "memory(GiB)": 61.19, + "step": 15, + "token_acc": 0.8346688741721854, + "train_speed(iter/s)": 0.117053 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.17850157618522644, + "learning_rate": 9.977554222133292e-05, + "loss": 0.5489081859588623, + "memory(GiB)": 61.19, + "step": 20, + "token_acc": 0.8306888325065324, + "train_speed(iter/s)": 0.124147 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.6602479815483093, + "eval_runtime": 0.8576, + "eval_samples_per_second": 4.664, + "eval_steps_per_second": 4.664, + "eval_token_acc": 0.7263779527559056, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.12782400846481323, + "learning_rate": 9.934132612707632e-05, + "loss": 0.5102836608886718, + "memory(GiB)": 70.6, + "step": 25, + "token_acc": 0.8355892096545197, + "train_speed(iter/s)": 0.126266 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.152016744017601, + "learning_rate": 9.868100580255466e-05, + "loss": 0.5704391956329345, + "memory(GiB)": 70.6, + "step": 30, + "token_acc": 0.8209851527097226, + "train_speed(iter/s)": 0.129292 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.14646950364112854, + "learning_rate": 9.779760713358059e-05, + "loss": 0.5319510459899902, + "memory(GiB)": 70.6, + "step": 35, + "token_acc": 0.8311028114949683, + "train_speed(iter/s)": 0.131949 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.14991414546966553, + "learning_rate": 9.669517825164434e-05, + "loss": 0.42771115303039553, + "memory(GiB)": 70.61, + "step": 40, + "token_acc": 0.8606268231463265, + "train_speed(iter/s)": 0.133725 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.5462462902069092, + "eval_runtime": 0.8819, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 4.536, + "eval_token_acc": 
0.7362204724409449, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.1364576518535614, + "learning_rate": 9.537877098354786e-05, + "loss": 0.4666141986846924, + "memory(GiB)": 70.61, + "step": 45, + "token_acc": 0.8414501379010016, + "train_speed(iter/s)": 0.134892 + }, + { + "epoch": 1.0, + "grad_norm": 0.11755681782960892, + "learning_rate": 9.385441770165385e-05, + "loss": 0.5955597877502441, + "memory(GiB)": 70.61, + "step": 50, + "token_acc": 0.8128776297590449, + "train_speed(iter/s)": 0.13723 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.12964965403079987, + "learning_rate": 9.212910368083245e-05, + "loss": 0.5069296360015869, + "memory(GiB)": 70.61, + "step": 55, + "token_acc": 0.8350234910528613, + "train_speed(iter/s)": 0.137114 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.15529637038707733, + "learning_rate": 9.021073508877845e-05, + "loss": 0.448743200302124, + "memory(GiB)": 70.61, + "step": 60, + "token_acc": 0.8458073103307054, + "train_speed(iter/s)": 0.137926 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.5411906838417053, + "eval_runtime": 1.0083, + "eval_samples_per_second": 3.967, + "eval_steps_per_second": 3.967, + "eval_token_acc": 0.7362204724409449, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.17046210169792175, + "learning_rate": 8.810810275638183e-05, + "loss": 0.5194157123565674, + "memory(GiB)": 70.62, + "step": 65, + "token_acc": 0.829247550378998, + "train_speed(iter/s)": 0.138628 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15912939608097076, + "learning_rate": 8.583084189417224e-05, + "loss": 0.37682528495788575, + "memory(GiB)": 70.62, + "step": 70, + "token_acc": 0.8765703517587939, + "train_speed(iter/s)": 0.138973 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.16839051246643066, + "learning_rate": 8.338938793943478e-05, + "loss": 0.48751306533813477, + "memory(GiB)": 70.62, + "step": 75, + "token_acc": 0.8434895707885737, + 
"train_speed(iter/s)": 0.140411 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.32181790471076965, + "learning_rate": 8.079492873632554e-05, + "loss": 0.4565874099731445, + "memory(GiB)": 70.62, + "step": 80, + "token_acc": 0.8534172661870504, + "train_speed(iter/s)": 0.141042 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.4897439479827881, + "eval_runtime": 0.956, + "eval_samples_per_second": 4.184, + "eval_steps_per_second": 4.184, + "eval_token_acc": 0.7421259842519685, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.1729101538658142, + "learning_rate": 7.805935326811912e-05, + "loss": 0.4354836463928223, + "memory(GiB)": 70.62, + "step": 85, + "token_acc": 0.8526262111167772, + "train_speed(iter/s)": 0.141128 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.2045697569847107, + "learning_rate": 7.519519717652039e-05, + "loss": 0.4280831336975098, + "memory(GiB)": 70.62, + "step": 90, + "token_acc": 0.855290819901892, + "train_speed(iter/s)": 0.141744 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.1761767864227295, + "learning_rate": 7.221558531769519e-05, + "loss": 0.449586820602417, + "memory(GiB)": 70.62, + "step": 95, + "token_acc": 0.8496861579121242, + "train_speed(iter/s)": 0.142084 + }, + { + "epoch": 2.0, + "grad_norm": 0.3147103786468506, + "learning_rate": 6.91341716182545e-05, + "loss": 0.42734193801879883, + "memory(GiB)": 70.62, + "step": 100, + "token_acc": 0.8712043289656097, + "train_speed(iter/s)": 0.143097 + }, + { + "epoch": 2.0, + "eval_loss": 0.4399929940700531, + "eval_runtime": 0.9538, + "eval_samples_per_second": 4.194, + "eval_steps_per_second": 4.194, + "eval_token_acc": 0.7480314960629921, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.2544735074043274, + "learning_rate": 6.5965076506799e-05, + "loss": 0.3569289445877075, + "memory(GiB)": 70.62, + "step": 105, + "token_acc": 0.8721506442021804, + "train_speed(iter/s)": 0.143239 + }, + { + "epoch": 
2.202020202020202, + "grad_norm": 0.17970111966133118, + "learning_rate": 6.272282220774091e-05, + "loss": 0.3037956953048706, + "memory(GiB)": 70.62, + "step": 110, + "token_acc": 0.9014316482959585, + "train_speed(iter/s)": 0.143197 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.24640092253684998, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.36233837604522706, + "memory(GiB)": 70.62, + "step": 115, + "token_acc": 0.88071698416526, + "train_speed(iter/s)": 0.143856 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.42206674814224243, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.33151195049285886, + "memory(GiB)": 70.62, + "step": 120, + "token_acc": 0.8921691877205077, + "train_speed(iter/s)": 0.144277 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.4253988265991211, + "eval_runtime": 0.8511, + "eval_samples_per_second": 4.7, + "eval_steps_per_second": 4.7, + "eval_token_acc": 0.7470472440944882, + "step": 120 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.3131689429283142, + "learning_rate": 5.270694542927088e-05, + "loss": 0.34746339321136477, + "memory(GiB)": 70.62, + "step": 125, + "token_acc": 0.8732354800196643, + "train_speed(iter/s)": 0.144095 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.2918804883956909, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.38753666877746584, + "memory(GiB)": 70.62, + "step": 130, + "token_acc": 0.8696001223335117, + "train_speed(iter/s)": 0.144472 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.3330991566181183, + "learning_rate": 4.594206372362845e-05, + "loss": 0.3318129062652588, + "memory(GiB)": 70.62, + "step": 135, + "token_acc": 0.8886697685884202, + "train_speed(iter/s)": 0.144381 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.31272196769714355, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.38605782985687254, + "memory(GiB)": 70.62, + "step": 140, + "token_acc": 0.8668198982104744, + "train_speed(iter/s)": 0.144688 + }, + { + 
"epoch": 2.808080808080808, + "eval_loss": 0.4327184557914734, + "eval_runtime": 0.8971, + "eval_samples_per_second": 4.459, + "eval_steps_per_second": 4.459, + "eval_token_acc": 0.7470472440944882, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.35478416085243225, + "learning_rate": 3.92514779894488e-05, + "loss": 0.4013655662536621, + "memory(GiB)": 70.62, + "step": 145, + "token_acc": 0.8638609643891634, + "train_speed(iter/s)": 0.144401 + }, + { + "epoch": 3.0, + "grad_norm": 0.5760998725891113, + "learning_rate": 3.597244112544208e-05, + "loss": 0.428325891494751, + "memory(GiB)": 70.62, + "step": 150, + "token_acc": 0.8648929716613225, + "train_speed(iter/s)": 0.144976 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.4245682656764984, + "learning_rate": 3.275768486860149e-05, + "loss": 0.28979499340057374, + "memory(GiB)": 70.62, + "step": 155, + "token_acc": 0.908249991466703, + "train_speed(iter/s)": 0.145038 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 0.2930150330066681, + "learning_rate": 2.962194068331996e-05, + "loss": 0.25866198539733887, + "memory(GiB)": 70.62, + "step": 160, + "token_acc": 0.9131860922807785, + "train_speed(iter/s)": 0.145129 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.40826284885406494, + "eval_runtime": 0.8819, + "eval_samples_per_second": 4.535, + "eval_steps_per_second": 4.535, + "eval_token_acc": 0.7490157480314961, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.5396267771720886, + "learning_rate": 2.65795779650105e-05, + "loss": 0.27140424251556394, + "memory(GiB)": 70.62, + "step": 165, + "token_acc": 0.9090280491837778, + "train_speed(iter/s)": 0.144897 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.4830700159072876, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.27426331043243407, + "memory(GiB)": 70.62, + "step": 170, + "token_acc": 0.905309461336626, + "train_speed(iter/s)": 0.145211 + }, + { + "epoch": 3.505050505050505, + 
"grad_norm": 0.489519327878952, + "learning_rate": 2.08302710446253e-05, + "loss": 0.27257814407348635, + "memory(GiB)": 70.62, + "step": 175, + "token_acc": 0.9106268856855515, + "train_speed(iter/s)": 0.145332 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.5418012738227844, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.21682074069976806, + "memory(GiB)": 70.62, + "step": 180, + "token_acc": 0.9202954023506101, + "train_speed(iter/s)": 0.145793 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.40150904655456543, + "eval_runtime": 0.9846, + "eval_samples_per_second": 4.063, + "eval_steps_per_second": 4.063, + "eval_token_acc": 0.7450787401574803, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.4423423409461975, + "learning_rate": 1.561502705732883e-05, + "loss": 0.24239687919616698, + "memory(GiB)": 70.62, + "step": 185, + "token_acc": 0.9123015991185491, + "train_speed(iter/s)": 0.14543 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.2829873263835907, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.2875248908996582, + "memory(GiB)": 70.62, + "step": 190, + "token_acc": 0.8957864133327873, + "train_speed(iter/s)": 0.145672 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.5757401585578918, + "learning_rate": 1.102933089792042e-05, + "loss": 0.2387687921524048, + "memory(GiB)": 70.62, + "step": 195, + "token_acc": 0.9186982386562463, + "train_speed(iter/s)": 0.14583 + }, + { + "epoch": 4.0, + "grad_norm": 0.6690001487731934, + "learning_rate": 8.999294173332058e-06, + "loss": 0.2767722368240356, + "memory(GiB)": 70.62, + "step": 200, + "token_acc": 0.9056017545462853, + "train_speed(iter/s)": 0.146225 + }, + { + "epoch": 4.0, + "eval_loss": 0.38474351167678833, + "eval_runtime": 0.8708, + "eval_samples_per_second": 4.593, + "eval_steps_per_second": 4.593, + "eval_token_acc": 0.7480314960629921, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.6503894925117493, + "learning_rate": 
7.157141191620548e-06, + "loss": 0.16934740543365479, + "memory(GiB)": 70.62, + "step": 205, + "token_acc": 0.9342587237324079, + "train_speed(iter/s)": 0.14617 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.4198131859302521, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.24090688228607177, + "memory(GiB)": 70.62, + "step": 210, + "token_acc": 0.9170140698280355, + "train_speed(iter/s)": 0.146031 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.37690314650535583, + "learning_rate": 4.069353111818913e-06, + "loss": 0.25371742248535156, + "memory(GiB)": 70.62, + "step": 215, + "token_acc": 0.9176332202647992, + "train_speed(iter/s)": 0.1459 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.47662192583084106, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.23050858974456787, + "memory(GiB)": 70.62, + "step": 220, + "token_acc": 0.9262000205570974, + "train_speed(iter/s)": 0.146025 + }, + { + "epoch": 4.404040404040404, + "eval_loss": 0.3850444257259369, + "eval_runtime": 0.9782, + "eval_samples_per_second": 4.089, + "eval_steps_per_second": 4.089, + "eval_token_acc": 0.7480314960629921, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.48112934827804565, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.1852494955062866, + "memory(GiB)": 70.62, + "step": 225, + "token_acc": 0.9320099255583126, + "train_speed(iter/s)": 0.146041 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.5440866947174072, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.17858517169952393, + "memory(GiB)": 70.62, + "step": 230, + "token_acc": 0.9379441985135849, + "train_speed(iter/s)": 0.146314 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.4088594615459442, + "learning_rate": 4.577201710596612e-07, + "loss": 0.23350396156311035, + "memory(GiB)": 70.62, + "step": 235, + "token_acc": 0.9229762656799928, + "train_speed(iter/s)": 0.146688 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.5109623670578003, + 
"learning_rate": 1.1456128564660273e-07, + "loss": 0.18792340755462647, + "memory(GiB)": 70.62, + "step": 240, + "token_acc": 0.9343817012151536, + "train_speed(iter/s)": 0.146335 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.38130462169647217, + "eval_runtime": 0.9483, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 4.218, + "eval_token_acc": 0.7470472440944882, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.250027398058066e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/training_args.bin b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a48f0b37bb73726c0ffea803b2ec9c9843f83f0 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4bc0a4e65f33549a5e111ad696a52e610f49087c4f1069f7c9bd8d1807492d +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/README.md b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4c2a1aa3a5c5c41634276dcdd9d0d193ad8eb4 --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_config.json 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e053b4864e625b192524b138960982274555d7f0 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "down_proj", + "q_proj", + "v_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51b7d68fdafc95ccc0f0e751a67f27f3f548558c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b85c35c05ec224da4bbb598e018b85a576f81f455e32d74843bc49e5925765b +size 275341720 diff --git 
a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/additional_config.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/args.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7117a9ad365fb868e74b28952e78d4993087857b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/args.json @@ -0,0 +1,305 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 8192, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": 
"reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 8, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": 
null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + 
"vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-14b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, 
warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/optimizer.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c5922273ebdade4963d8a6448edf2bd32ed18b9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1dada549db69c82ef4e527a3b6fde1ebf5f4cd43b2007fb345a393caac5f72e7 +size 551070514 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/rng_state.pth b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e17f8f38292d45aa269dbbb3d344654c12f1ec46 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a8c90fa12d8f3c889782ed2bc0cfbdb034284a74ffdd06d42891f04390c442 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/scheduler.pt b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c07b610e877e513fda3813a64af716a38654c2f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ba022925b97a0c60fdb73ede217e52b3b55c5065f112ff19fea77b6a69dd5d +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/trainer_state.json b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56223e8bda150364a9e3c8d941fd95bebe26705d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/trainer_state.json @@ -0,0 +1,650 @@ +{ + "best_metric": 0.38130462, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240", + "epoch": 4.909090909090909, + "eval_steps": 20, + "global_step": 245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020202020202020204, + "grad_norm": 0.21148298680782318, + "learning_rate": 7.692307692307694e-06, + "loss": 0.6009877920150757, + "memory(GiB)": 38.38, + "step": 1, + "token_acc": 0.8543944031482291, + "train_speed(iter/s)": 0.028445 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.5691107511520386, + "learning_rate": 3.846153846153846e-05, + "loss": 0.9119688868522644, + "memory(GiB)": 48.25, + "step": 5, + "token_acc": 0.7889227360841872, + "train_speed(iter/s)": 0.08115 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.27697205543518066, + "learning_rate": 7.692307692307693e-05, + "loss": 0.7359254837036133, + "memory(GiB)": 54.71, + "step": 10, + "token_acc": 0.8115517500375544, + "train_speed(iter/s)": 0.106586 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.48245590925216675, + "learning_rate": 9.99816643111642e-05, + "loss": 0.5770383834838867, + "memory(GiB)": 61.19, + "step": 15, + "token_acc": 0.8346688741721854, + "train_speed(iter/s)": 0.117053 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.17850157618522644, + "learning_rate": 9.977554222133292e-05, + "loss": 0.5489081859588623, + "memory(GiB)": 61.19, + "step": 20, + "token_acc": 0.8306888325065324, + "train_speed(iter/s)": 0.124147 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.6602479815483093, + "eval_runtime": 0.8576, + "eval_samples_per_second": 4.664, + "eval_steps_per_second": 4.664, + "eval_token_acc": 0.7263779527559056, + "step": 20 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.12782400846481323, + "learning_rate": 9.934132612707632e-05, + "loss": 0.5102836608886718, + "memory(GiB)": 70.6, + 
"step": 25, + "token_acc": 0.8355892096545197, + "train_speed(iter/s)": 0.126266 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.152016744017601, + "learning_rate": 9.868100580255466e-05, + "loss": 0.5704391956329345, + "memory(GiB)": 70.6, + "step": 30, + "token_acc": 0.8209851527097226, + "train_speed(iter/s)": 0.129292 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.14646950364112854, + "learning_rate": 9.779760713358059e-05, + "loss": 0.5319510459899902, + "memory(GiB)": 70.6, + "step": 35, + "token_acc": 0.8311028114949683, + "train_speed(iter/s)": 0.131949 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.14991414546966553, + "learning_rate": 9.669517825164434e-05, + "loss": 0.42771115303039553, + "memory(GiB)": 70.61, + "step": 40, + "token_acc": 0.8606268231463265, + "train_speed(iter/s)": 0.133725 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.5462462902069092, + "eval_runtime": 0.8819, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 4.536, + "eval_token_acc": 0.7362204724409449, + "step": 40 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.1364576518535614, + "learning_rate": 9.537877098354786e-05, + "loss": 0.4666141986846924, + "memory(GiB)": 70.61, + "step": 45, + "token_acc": 0.8414501379010016, + "train_speed(iter/s)": 0.134892 + }, + { + "epoch": 1.0, + "grad_norm": 0.11755681782960892, + "learning_rate": 9.385441770165385e-05, + "loss": 0.5955597877502441, + "memory(GiB)": 70.61, + "step": 50, + "token_acc": 0.8128776297590449, + "train_speed(iter/s)": 0.13723 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.12964965403079987, + "learning_rate": 9.212910368083245e-05, + "loss": 0.5069296360015869, + "memory(GiB)": 70.61, + "step": 55, + "token_acc": 0.8350234910528613, + "train_speed(iter/s)": 0.137114 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.15529637038707733, + "learning_rate": 9.021073508877845e-05, + "loss": 0.448743200302124, + "memory(GiB)": 70.61, + "step": 60, + 
"token_acc": 0.8458073103307054, + "train_speed(iter/s)": 0.137926 + }, + { + "epoch": 1.202020202020202, + "eval_loss": 0.5411906838417053, + "eval_runtime": 1.0083, + "eval_samples_per_second": 3.967, + "eval_steps_per_second": 3.967, + "eval_token_acc": 0.7362204724409449, + "step": 60 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.17046210169792175, + "learning_rate": 8.810810275638183e-05, + "loss": 0.5194157123565674, + "memory(GiB)": 70.62, + "step": 65, + "token_acc": 0.829247550378998, + "train_speed(iter/s)": 0.138628 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.15912939608097076, + "learning_rate": 8.583084189417224e-05, + "loss": 0.37682528495788575, + "memory(GiB)": 70.62, + "step": 70, + "token_acc": 0.8765703517587939, + "train_speed(iter/s)": 0.138973 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.16839051246643066, + "learning_rate": 8.338938793943478e-05, + "loss": 0.48751306533813477, + "memory(GiB)": 70.62, + "step": 75, + "token_acc": 0.8434895707885737, + "train_speed(iter/s)": 0.140411 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.32181790471076965, + "learning_rate": 8.079492873632554e-05, + "loss": 0.4565874099731445, + "memory(GiB)": 70.62, + "step": 80, + "token_acc": 0.8534172661870504, + "train_speed(iter/s)": 0.141042 + }, + { + "epoch": 1.606060606060606, + "eval_loss": 0.4897439479827881, + "eval_runtime": 0.956, + "eval_samples_per_second": 4.184, + "eval_steps_per_second": 4.184, + "eval_token_acc": 0.7421259842519685, + "step": 80 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.1729101538658142, + "learning_rate": 7.805935326811912e-05, + "loss": 0.4354836463928223, + "memory(GiB)": 70.62, + "step": 85, + "token_acc": 0.8526262111167772, + "train_speed(iter/s)": 0.141128 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.2045697569847107, + "learning_rate": 7.519519717652039e-05, + "loss": 0.4280831336975098, + "memory(GiB)": 70.62, + "step": 90, + "token_acc": 
0.855290819901892, + "train_speed(iter/s)": 0.141744 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.1761767864227295, + "learning_rate": 7.221558531769519e-05, + "loss": 0.449586820602417, + "memory(GiB)": 70.62, + "step": 95, + "token_acc": 0.8496861579121242, + "train_speed(iter/s)": 0.142084 + }, + { + "epoch": 2.0, + "grad_norm": 0.3147103786468506, + "learning_rate": 6.91341716182545e-05, + "loss": 0.42734193801879883, + "memory(GiB)": 70.62, + "step": 100, + "token_acc": 0.8712043289656097, + "train_speed(iter/s)": 0.143097 + }, + { + "epoch": 2.0, + "eval_loss": 0.4399929940700531, + "eval_runtime": 0.9538, + "eval_samples_per_second": 4.194, + "eval_steps_per_second": 4.194, + "eval_token_acc": 0.7480314960629921, + "step": 100 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.2544735074043274, + "learning_rate": 6.5965076506799e-05, + "loss": 0.3569289445877075, + "memory(GiB)": 70.62, + "step": 105, + "token_acc": 0.8721506442021804, + "train_speed(iter/s)": 0.143239 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.17970111966133118, + "learning_rate": 6.272282220774091e-05, + "loss": 0.3037956953048706, + "memory(GiB)": 70.62, + "step": 110, + "token_acc": 0.9014316482959585, + "train_speed(iter/s)": 0.143197 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.24640092253684998, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.36233837604522706, + "memory(GiB)": 70.62, + "step": 115, + "token_acc": 0.88071698416526, + "train_speed(iter/s)": 0.143856 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.42206674814224243, + "learning_rate": 5.6078533102935745e-05, + "loss": 0.33151195049285886, + "memory(GiB)": 70.62, + "step": 120, + "token_acc": 0.8921691877205077, + "train_speed(iter/s)": 0.144277 + }, + { + "epoch": 2.404040404040404, + "eval_loss": 0.4253988265991211, + "eval_runtime": 0.8511, + "eval_samples_per_second": 4.7, + "eval_steps_per_second": 4.7, + "eval_token_acc": 0.7470472440944882, + "step": 120 + }, 
+ { + "epoch": 2.505050505050505, + "grad_norm": 0.3131689429283142, + "learning_rate": 5.270694542927088e-05, + "loss": 0.34746339321136477, + "memory(GiB)": 70.62, + "step": 125, + "token_acc": 0.8732354800196643, + "train_speed(iter/s)": 0.144095 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.2918804883956909, + "learning_rate": 4.9322953309663916e-05, + "loss": 0.38753666877746584, + "memory(GiB)": 70.62, + "step": 130, + "token_acc": 0.8696001223335117, + "train_speed(iter/s)": 0.144472 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.3330991566181183, + "learning_rate": 4.594206372362845e-05, + "loss": 0.3318129062652588, + "memory(GiB)": 70.62, + "step": 135, + "token_acc": 0.8886697685884202, + "train_speed(iter/s)": 0.144381 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.31272196769714355, + "learning_rate": 4.2579769433468694e-05, + "loss": 0.38605782985687254, + "memory(GiB)": 70.62, + "step": 140, + "token_acc": 0.8668198982104744, + "train_speed(iter/s)": 0.144688 + }, + { + "epoch": 2.808080808080808, + "eval_loss": 0.4327184557914734, + "eval_runtime": 0.8971, + "eval_samples_per_second": 4.459, + "eval_steps_per_second": 4.459, + "eval_token_acc": 0.7470472440944882, + "step": 140 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.35478416085243225, + "learning_rate": 3.92514779894488e-05, + "loss": 0.4013655662536621, + "memory(GiB)": 70.62, + "step": 145, + "token_acc": 0.8638609643891634, + "train_speed(iter/s)": 0.144401 + }, + { + "epoch": 3.0, + "grad_norm": 0.5760998725891113, + "learning_rate": 3.597244112544208e-05, + "loss": 0.428325891494751, + "memory(GiB)": 70.62, + "step": 150, + "token_acc": 0.8648929716613225, + "train_speed(iter/s)": 0.144976 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 0.4245682656764984, + "learning_rate": 3.275768486860149e-05, + "loss": 0.28979499340057374, + "memory(GiB)": 70.62, + "step": 155, + "token_acc": 0.908249991466703, + "train_speed(iter/s)": 0.145038 + }, + { + 
"epoch": 3.202020202020202, + "grad_norm": 0.2930150330066681, + "learning_rate": 2.962194068331996e-05, + "loss": 0.25866198539733887, + "memory(GiB)": 70.62, + "step": 160, + "token_acc": 0.9131860922807785, + "train_speed(iter/s)": 0.145129 + }, + { + "epoch": 3.202020202020202, + "eval_loss": 0.40826284885406494, + "eval_runtime": 0.8819, + "eval_samples_per_second": 4.535, + "eval_steps_per_second": 4.535, + "eval_token_acc": 0.7490157480314961, + "step": 160 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.5396267771720886, + "learning_rate": 2.65795779650105e-05, + "loss": 0.27140424251556394, + "memory(GiB)": 70.62, + "step": 165, + "token_acc": 0.9090280491837778, + "train_speed(iter/s)": 0.144897 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.4830700159072876, + "learning_rate": 2.3644538193049625e-05, + "loss": 0.27426331043243407, + "memory(GiB)": 70.62, + "step": 170, + "token_acc": 0.905309461336626, + "train_speed(iter/s)": 0.145211 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 0.489519327878952, + "learning_rate": 2.08302710446253e-05, + "loss": 0.27257814407348635, + "memory(GiB)": 70.62, + "step": 175, + "token_acc": 0.9106268856855515, + "train_speed(iter/s)": 0.145332 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.5418012738227844, + "learning_rate": 1.8149672762244624e-05, + "loss": 0.21682074069976806, + "memory(GiB)": 70.62, + "step": 180, + "token_acc": 0.9202954023506101, + "train_speed(iter/s)": 0.145793 + }, + { + "epoch": 3.606060606060606, + "eval_loss": 0.40150904655456543, + "eval_runtime": 0.9846, + "eval_samples_per_second": 4.063, + "eval_steps_per_second": 4.063, + "eval_token_acc": 0.7450787401574803, + "step": 180 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 0.4423423409461975, + "learning_rate": 1.561502705732883e-05, + "loss": 0.24239687919616698, + "memory(GiB)": 70.62, + "step": 185, + "token_acc": 0.9123015991185491, + "train_speed(iter/s)": 0.14543 + }, + { + "epoch": 
3.808080808080808, + "grad_norm": 0.2829873263835907, + "learning_rate": 1.3237948820702495e-05, + "loss": 0.2875248908996582, + "memory(GiB)": 70.62, + "step": 190, + "token_acc": 0.8957864133327873, + "train_speed(iter/s)": 0.145672 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.5757401585578918, + "learning_rate": 1.102933089792042e-05, + "loss": 0.2387687921524048, + "memory(GiB)": 70.62, + "step": 195, + "token_acc": 0.9186982386562463, + "train_speed(iter/s)": 0.14583 + }, + { + "epoch": 4.0, + "grad_norm": 0.6690001487731934, + "learning_rate": 8.999294173332058e-06, + "loss": 0.2767722368240356, + "memory(GiB)": 70.62, + "step": 200, + "token_acc": 0.9056017545462853, + "train_speed(iter/s)": 0.146225 + }, + { + "epoch": 4.0, + "eval_loss": 0.38474351167678833, + "eval_runtime": 0.8708, + "eval_samples_per_second": 4.593, + "eval_steps_per_second": 4.593, + "eval_token_acc": 0.7480314960629921, + "step": 200 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 0.6503894925117493, + "learning_rate": 7.157141191620548e-06, + "loss": 0.16934740543365479, + "memory(GiB)": 70.62, + "step": 205, + "token_acc": 0.9342587237324079, + "train_speed(iter/s)": 0.14617 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.4198131859302521, + "learning_rate": 5.5113135293435815e-06, + "loss": 0.24090688228607177, + "memory(GiB)": 70.62, + "step": 210, + "token_acc": 0.9170140698280355, + "train_speed(iter/s)": 0.146031 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.37690314650535583, + "learning_rate": 4.069353111818913e-06, + "loss": 0.25371742248535156, + "memory(GiB)": 70.62, + "step": 215, + "token_acc": 0.9176332202647992, + "train_speed(iter/s)": 0.1459 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.47662192583084106, + "learning_rate": 2.8378676526178482e-06, + "loss": 0.23050858974456787, + "memory(GiB)": 70.62, + "step": 220, + "token_acc": 0.9262000205570974, + "train_speed(iter/s)": 0.146025 + }, + { + "epoch": 4.404040404040404, 
+ "eval_loss": 0.3850444257259369, + "eval_runtime": 0.9782, + "eval_samples_per_second": 4.089, + "eval_steps_per_second": 4.089, + "eval_token_acc": 0.7480314960629921, + "step": 220 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 0.48112934827804565, + "learning_rate": 1.8225003740388547e-06, + "loss": 0.1852494955062866, + "memory(GiB)": 70.62, + "step": 225, + "token_acc": 0.9320099255583126, + "train_speed(iter/s)": 0.146041 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.5440866947174072, + "learning_rate": 1.0279041473154116e-06, + "loss": 0.17858517169952393, + "memory(GiB)": 70.62, + "step": 230, + "token_acc": 0.9379441985135849, + "train_speed(iter/s)": 0.146314 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 0.4088594615459442, + "learning_rate": 4.577201710596612e-07, + "loss": 0.23350396156311035, + "memory(GiB)": 70.62, + "step": 235, + "token_acc": 0.9229762656799928, + "train_speed(iter/s)": 0.146688 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.5109623670578003, + "learning_rate": 1.1456128564660273e-07, + "loss": 0.18792340755462647, + "memory(GiB)": 70.62, + "step": 240, + "token_acc": 0.9343817012151536, + "train_speed(iter/s)": 0.146335 + }, + { + "epoch": 4.808080808080808, + "eval_loss": 0.38130462169647217, + "eval_runtime": 0.9483, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 4.218, + "eval_token_acc": 0.7470472440944882, + "step": 240 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.5497174263000488, + "learning_rate": 0.0, + "loss": 0.22356274127960205, + "memory(GiB)": 70.62, + "step": 245, + "token_acc": 0.9154690412093601, + "train_speed(iter/s)": 0.14648 + }, + { + "epoch": 4.909090909090909, + "eval_loss": 0.38277411460876465, + "eval_runtime": 0.9362, + "eval_samples_per_second": 4.273, + "eval_steps_per_second": 4.273, + "eval_token_acc": 0.7519685039370079, + "step": 245 + } + ], + "logging_steps": 5, + "max_steps": 245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, 
+ "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.271899005323305e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/training_args.bin b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a48f0b37bb73726c0ffea803b2ec9c9843f83f0 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4bc0a4e65f33549a5e111ad696a52e610f49087c4f1069f7c9bd8d1807492d +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..f3af3495071193cfc98dc4f6223184a2a2dc3bf5 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..0edff345f14f85747a6a94b7c8f24a0befb9c988 Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..11d58475ee438b5a4801348d0b86eed7eb2a14dc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..c433563904c211705f98107bad990b8619b98068 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..90f65c0c0421f7cb92d2573397b1dee0724f91ae Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_epoch.png new file mode 100644 index 
0000000000000000000000000000000000000000..9cc2f5018175ad195aab7254264c4c8fdcaa870b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..5354f308cfa6a06a407efb94b38b696fc4542a27 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..1a747bb27a6bf92800a1028c0eaaaa4305471d7f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1bc3c72d801dfd41b1f033d8f06c9d41364c38d6 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_memory(GiB).png 
b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..6851df64acaab5981b128c86ec13bf11cbb30854 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..30fef5cc0e8e5e9c4cb002fc097585ab5e86d162 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..b564b3c8ed4e50abedce9dcce419a460ee599960 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..59f9b7b87dafe35c5cbec3fc9f856291648276a0 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_loss.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_runtime.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..207db6a1b2afd0f0daaeba18b89d7feeb4bc3707 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..da4e1041150ed026e5fe9f869a351481fe29a9e6 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..6e15deddc3f430b0e0092e6b29e182db4734afac Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..0f380b3f6ace910fc11e742e477b703835958537 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/logging.jsonl b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..221a2fd22923decf2f8a7c36fd9aa1b38fcd8be4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/logging.jsonl @@ -0,0 +1,65 @@ +{"loss": 0.60098779, "token_acc": 0.8543944, "grad_norm": 0.21148299, "learning_rate": 7.69e-06, "memory(GiB)": 38.38, "train_speed(iter/s)": 0.028445, "epoch": 0.02020202, "global_step/max_steps": "1/245", "percentage": "0.41%", "elapsed_time": "34s", "remaining_time": "2h 21m 44s"} +{"loss": 0.91196889, "token_acc": 0.78892274, "grad_norm": 0.56911075, "learning_rate": 3.846e-05, "memory(GiB)": 48.25, "train_speed(iter/s)": 0.08115, "epoch": 0.1010101, "global_step/max_steps": "5/245", "percentage": "2.04%", "elapsed_time": "1m 1s", "remaining_time": "49m 3s"} +{"loss": 0.73592548, "token_acc": 0.81155175, "grad_norm": 0.27697206, "learning_rate": 7.692e-05, "memory(GiB)": 54.71, "train_speed(iter/s)": 0.106586, "epoch": 0.2020202, "global_step/max_steps": "10/245", "percentage": "4.08%", "elapsed_time": "1m 33s", "remaining_time": "36m 37s"} +{"loss": 0.57703838, "token_acc": 0.83466887, "grad_norm": 0.48245591, "learning_rate": 9.998e-05, "memory(GiB)": 61.19, "train_speed(iter/s)": 0.117053, "epoch": 0.3030303, "global_step/max_steps": "15/245", "percentage": "6.12%", "elapsed_time": "2m 7s", "remaining_time": "32m 40s"} +{"loss": 0.54890819, "token_acc": 0.83068883, "grad_norm": 0.17850158, "learning_rate": 9.978e-05, "memory(GiB)": 61.19, 
"train_speed(iter/s)": 0.124147, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "2m 40s", "remaining_time": "30m 9s"} +{"eval_loss": 0.66024798, "eval_token_acc": 0.72637795, "eval_runtime": 0.8576, "eval_samples_per_second": 4.664, "eval_steps_per_second": 4.664, "epoch": 0.4040404, "global_step/max_steps": "20/245", "percentage": "8.16%", "elapsed_time": "2m 41s", "remaining_time": "30m 18s"} +{"loss": 0.51028366, "token_acc": 0.83558921, "grad_norm": 0.12782401, "learning_rate": 9.934e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.126266, "epoch": 0.50505051, "global_step/max_steps": "25/245", "percentage": "10.20%", "elapsed_time": "3m 17s", "remaining_time": "28m 59s"} +{"loss": 0.5704392, "token_acc": 0.82098515, "grad_norm": 0.15201674, "learning_rate": 9.868e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.129292, "epoch": 0.60606061, "global_step/max_steps": "30/245", "percentage": "12.24%", "elapsed_time": "3m 51s", "remaining_time": "27m 40s"} +{"loss": 0.53195105, "token_acc": 0.83110281, "grad_norm": 0.1464695, "learning_rate": 9.78e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.131949, "epoch": 0.70707071, "global_step/max_steps": "35/245", "percentage": "14.29%", "elapsed_time": "4m 24s", "remaining_time": "26m 29s"} +{"loss": 0.42771115, "token_acc": 0.86062682, "grad_norm": 0.14991415, "learning_rate": 9.67e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.133725, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "4m 58s", "remaining_time": "25m 31s"} +{"eval_loss": 0.54624629, "eval_token_acc": 0.73622047, "eval_runtime": 0.8819, "eval_samples_per_second": 4.536, "eval_steps_per_second": 4.536, "epoch": 0.80808081, "global_step/max_steps": "40/245", "percentage": "16.33%", "elapsed_time": "4m 59s", "remaining_time": "25m 36s"} +{"loss": 0.4666142, "token_acc": 0.84145014, "grad_norm": 0.13645765, "learning_rate": 9.538e-05, "memory(GiB)": 
70.61, "train_speed(iter/s)": 0.134892, "epoch": 0.90909091, "global_step/max_steps": "45/245", "percentage": "18.37%", "elapsed_time": "5m 33s", "remaining_time": "24m 41s"} +{"loss": 0.59555979, "token_acc": 0.81287763, "grad_norm": 0.11755682, "learning_rate": 9.385e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.13723, "epoch": 1.0, "global_step/max_steps": "50/245", "percentage": "20.41%", "elapsed_time": "6m 4s", "remaining_time": "23m 39s"} +{"loss": 0.50692964, "token_acc": 0.83502349, "grad_norm": 0.12964965, "learning_rate": 9.213e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.137114, "epoch": 1.1010101, "global_step/max_steps": "55/245", "percentage": "22.45%", "elapsed_time": "6m 40s", "remaining_time": "23m 4s"} +{"loss": 0.4487432, "token_acc": 0.84580731, "grad_norm": 0.15529637, "learning_rate": 9.021e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.137926, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "7m 14s", "remaining_time": "22m 20s"} +{"eval_loss": 0.54119068, "eval_token_acc": 0.73622047, "eval_runtime": 1.0083, "eval_samples_per_second": 3.967, "eval_steps_per_second": 3.967, "epoch": 1.2020202, "global_step/max_steps": "60/245", "percentage": "24.49%", "elapsed_time": "7m 15s", "remaining_time": "22m 23s"} +{"loss": 0.51941571, "token_acc": 0.82924755, "grad_norm": 0.1704621, "learning_rate": 8.811e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.138628, "epoch": 1.3030303, "global_step/max_steps": "65/245", "percentage": "26.53%", "elapsed_time": "7m 48s", "remaining_time": "21m 37s"} +{"loss": 0.37682528, "token_acc": 0.87657035, "grad_norm": 0.1591294, "learning_rate": 8.583e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.138973, "epoch": 1.4040404, "global_step/max_steps": "70/245", "percentage": "28.57%", "elapsed_time": "8m 23s", "remaining_time": "20m 58s"} +{"loss": 0.48751307, "token_acc": 0.84348957, "grad_norm": 0.16839051, "learning_rate": 8.339e-05, 
"memory(GiB)": 70.62, "train_speed(iter/s)": 0.140411, "epoch": 1.50505051, "global_step/max_steps": "75/245", "percentage": "30.61%", "elapsed_time": "8m 53s", "remaining_time": "20m 10s"} +{"loss": 0.45658741, "token_acc": 0.85341727, "grad_norm": 0.3218179, "learning_rate": 8.079e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141042, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "9m 26s", "remaining_time": "19m 29s"} +{"eval_loss": 0.48974395, "eval_token_acc": 0.74212598, "eval_runtime": 0.956, "eval_samples_per_second": 4.184, "eval_steps_per_second": 4.184, "epoch": 1.60606061, "global_step/max_steps": "80/245", "percentage": "32.65%", "elapsed_time": "9m 27s", "remaining_time": "19m 31s"} +{"loss": 0.43548365, "token_acc": 0.85262621, "grad_norm": 0.17291015, "learning_rate": 7.806e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141128, "epoch": 1.70707071, "global_step/max_steps": "85/245", "percentage": "34.69%", "elapsed_time": "10m 1s", "remaining_time": "18m 53s"} +{"loss": 0.42808313, "token_acc": 0.85529082, "grad_norm": 0.20456976, "learning_rate": 7.52e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141744, "epoch": 1.80808081, "global_step/max_steps": "90/245", "percentage": "36.73%", "elapsed_time": "10m 34s", "remaining_time": "18m 13s"} +{"loss": 0.44958682, "token_acc": 0.84968616, "grad_norm": 0.17617679, "learning_rate": 7.222e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.142084, "epoch": 1.90909091, "global_step/max_steps": "95/245", "percentage": "38.78%", "elapsed_time": "11m 8s", "remaining_time": "17m 35s"} +{"loss": 0.42734194, "token_acc": 0.87120433, "grad_norm": 0.31471038, "learning_rate": 6.913e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143097, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "11m 38s", "remaining_time": "16m 52s"} +{"eval_loss": 0.43999299, "eval_token_acc": 0.7480315, "eval_runtime": 0.9538, 
"eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "epoch": 2.0, "global_step/max_steps": "100/245", "percentage": "40.82%", "elapsed_time": "11m 39s", "remaining_time": "16m 54s"} +{"loss": 0.35692894, "token_acc": 0.87215064, "grad_norm": 0.25447351, "learning_rate": 6.597e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143239, "epoch": 2.1010101, "global_step/max_steps": "105/245", "percentage": "42.86%", "elapsed_time": "12m 12s", "remaining_time": "16m 16s"} +{"loss": 0.3037957, "token_acc": 0.90143165, "grad_norm": 0.17970112, "learning_rate": 6.272e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143197, "epoch": 2.2020202, "global_step/max_steps": "110/245", "percentage": "44.90%", "elapsed_time": "12m 47s", "remaining_time": "15m 42s"} +{"loss": 0.36233838, "token_acc": 0.88071698, "grad_norm": 0.24640092, "learning_rate": 5.942e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143856, "epoch": 2.3030303, "global_step/max_steps": "115/245", "percentage": "46.94%", "elapsed_time": "13m 19s", "remaining_time": "15m 3s"} +{"loss": 0.33151195, "token_acc": 0.89216919, "grad_norm": 0.42206675, "learning_rate": 5.608e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144277, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "13m 51s", "remaining_time": "14m 26s"} +{"eval_loss": 0.42539883, "eval_token_acc": 0.74704724, "eval_runtime": 0.8511, "eval_samples_per_second": 4.7, "eval_steps_per_second": 4.7, "epoch": 2.4040404, "global_step/max_steps": "120/245", "percentage": "48.98%", "elapsed_time": "13m 52s", "remaining_time": "14m 26s"} +{"loss": 0.34746339, "token_acc": 0.87323548, "grad_norm": 0.31316894, "learning_rate": 5.271e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144095, "epoch": 2.50505051, "global_step/max_steps": "125/245", "percentage": "51.02%", "elapsed_time": "14m 27s", "remaining_time": "13m 52s"} +{"loss": 0.38753667, "token_acc": 0.86960012, "grad_norm": 
0.29188049, "learning_rate": 4.932e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144472, "epoch": 2.60606061, "global_step/max_steps": "130/245", "percentage": "53.06%", "elapsed_time": "14m 59s", "remaining_time": "13m 15s"} +{"loss": 0.33181291, "token_acc": 0.88866977, "grad_norm": 0.33309916, "learning_rate": 4.594e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144381, "epoch": 2.70707071, "global_step/max_steps": "135/245", "percentage": "55.10%", "elapsed_time": "15m 34s", "remaining_time": "12m 41s"} +{"loss": 0.38605783, "token_acc": 0.8668199, "grad_norm": 0.31272197, "learning_rate": 4.258e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144688, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "16m 7s", "remaining_time": "12m 5s"} +{"eval_loss": 0.43271846, "eval_token_acc": 0.74704724, "eval_runtime": 0.8971, "eval_samples_per_second": 4.459, "eval_steps_per_second": 4.459, "epoch": 2.80808081, "global_step/max_steps": "140/245", "percentage": "57.14%", "elapsed_time": "16m 8s", "remaining_time": "12m 6s"} +{"loss": 0.40136557, "token_acc": 0.86386096, "grad_norm": 0.35478416, "learning_rate": 3.925e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144401, "epoch": 2.90909091, "global_step/max_steps": "145/245", "percentage": "59.18%", "elapsed_time": "16m 43s", "remaining_time": "11m 32s"} +{"loss": 0.42832589, "token_acc": 0.86489297, "grad_norm": 0.57609987, "learning_rate": 3.597e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144976, "epoch": 3.0, "global_step/max_steps": "150/245", "percentage": "61.22%", "elapsed_time": "17m 14s", "remaining_time": "10m 55s"} +{"loss": 0.28979499, "token_acc": 0.90824999, "grad_norm": 0.42456827, "learning_rate": 3.276e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145038, "epoch": 3.1010101, "global_step/max_steps": "155/245", "percentage": "63.27%", "elapsed_time": "17m 48s", "remaining_time": "10m 20s"} +{"loss": 0.25866199, "token_acc": 
0.91318609, "grad_norm": 0.29301503, "learning_rate": 2.962e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145129, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "18m 22s", "remaining_time": "9m 45s"} +{"eval_loss": 0.40826285, "eval_token_acc": 0.74901575, "eval_runtime": 0.8819, "eval_samples_per_second": 4.535, "eval_steps_per_second": 4.535, "epoch": 3.2020202, "global_step/max_steps": "160/245", "percentage": "65.31%", "elapsed_time": "18m 23s", "remaining_time": "9m 45s"} +{"loss": 0.27140424, "token_acc": 0.90902805, "grad_norm": 0.53962678, "learning_rate": 2.658e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144897, "epoch": 3.3030303, "global_step/max_steps": "165/245", "percentage": "67.35%", "elapsed_time": "18m 58s", "remaining_time": "9m 11s"} +{"loss": 0.27426331, "token_acc": 0.90530946, "grad_norm": 0.48307002, "learning_rate": 2.364e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145211, "epoch": 3.4040404, "global_step/max_steps": "170/245", "percentage": "69.39%", "elapsed_time": "19m 30s", "remaining_time": "8m 36s"} +{"loss": 0.27257814, "token_acc": 0.91062689, "grad_norm": 0.48951933, "learning_rate": 2.083e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145332, "epoch": 3.50505051, "global_step/max_steps": "175/245", "percentage": "71.43%", "elapsed_time": "20m 3s", "remaining_time": "8m 1s"} +{"loss": 0.21682074, "token_acc": 0.9202954, "grad_norm": 0.54180127, "learning_rate": 1.815e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145793, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "20m 34s", "remaining_time": "7m 25s"} +{"eval_loss": 0.40150905, "eval_token_acc": 0.74507874, "eval_runtime": 0.9846, "eval_samples_per_second": 4.063, "eval_steps_per_second": 4.063, "epoch": 3.60606061, "global_step/max_steps": "180/245", "percentage": "73.47%", "elapsed_time": "20m 35s", "remaining_time": "7m 26s"} +{"loss": 
0.24239688, "token_acc": 0.9123016, "grad_norm": 0.44234234, "learning_rate": 1.562e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14543, "epoch": 3.70707071, "global_step/max_steps": "185/245", "percentage": "75.51%", "elapsed_time": "21m 11s", "remaining_time": "6m 52s"} +{"loss": 0.28752489, "token_acc": 0.89578641, "grad_norm": 0.28298733, "learning_rate": 1.324e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145672, "epoch": 3.80808081, "global_step/max_steps": "190/245", "percentage": "77.55%", "elapsed_time": "21m 44s", "remaining_time": "6m 17s"} +{"loss": 0.23876879, "token_acc": 0.91869824, "grad_norm": 0.57574016, "learning_rate": 1.103e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14583, "epoch": 3.90909091, "global_step/max_steps": "195/245", "percentage": "79.59%", "elapsed_time": "22m 16s", "remaining_time": "5m 42s"} +{"loss": 0.27677224, "token_acc": 0.90560175, "grad_norm": 0.66900015, "learning_rate": 9e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146225, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "22m 47s", "remaining_time": "5m 7s"} +{"eval_loss": 0.38474351, "eval_token_acc": 0.7480315, "eval_runtime": 0.8708, "eval_samples_per_second": 4.593, "eval_steps_per_second": 4.593, "epoch": 4.0, "global_step/max_steps": "200/245", "percentage": "81.63%", "elapsed_time": "22m 48s", "remaining_time": "5m 7s"} +{"loss": 0.16934741, "token_acc": 0.93425872, "grad_norm": 0.65038949, "learning_rate": 7.16e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14617, "epoch": 4.1010101, "global_step/max_steps": "205/245", "percentage": "83.67%", "elapsed_time": "23m 22s", "remaining_time": "4m 33s"} +{"loss": 0.24090688, "token_acc": 0.91701407, "grad_norm": 0.41981319, "learning_rate": 5.51e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146031, "epoch": 4.2020202, "global_step/max_steps": "210/245", "percentage": "85.71%", "elapsed_time": "23m 57s", "remaining_time": "3m 59s"} +{"loss": 
0.25371742, "token_acc": 0.91763322, "grad_norm": 0.37690315, "learning_rate": 4.07e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.1459, "epoch": 4.3030303, "global_step/max_steps": "215/245", "percentage": "87.76%", "elapsed_time": "24m 33s", "remaining_time": "3m 25s"} +{"loss": 0.23050859, "token_acc": 0.92620002, "grad_norm": 0.47662193, "learning_rate": 2.84e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146025, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "25m 6s", "remaining_time": "2m 51s"} +{"eval_loss": 0.38504443, "eval_token_acc": 0.7480315, "eval_runtime": 0.9782, "eval_samples_per_second": 4.089, "eval_steps_per_second": 4.089, "epoch": 4.4040404, "global_step/max_steps": "220/245", "percentage": "89.80%", "elapsed_time": "25m 7s", "remaining_time": "2m 51s"} +{"loss": 0.1852495, "token_acc": 0.93200993, "grad_norm": 0.48112935, "learning_rate": 1.82e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146041, "epoch": 4.50505051, "global_step/max_steps": "225/245", "percentage": "91.84%", "elapsed_time": "25m 40s", "remaining_time": "2m 16s"} +{"loss": 0.17858517, "token_acc": 0.9379442, "grad_norm": 0.54408669, "learning_rate": 1.03e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146314, "epoch": 4.60606061, "global_step/max_steps": "230/245", "percentage": "93.88%", "elapsed_time": "26m 11s", "remaining_time": "1m 42s"} +{"loss": 0.23350396, "token_acc": 0.92297627, "grad_norm": 0.40885946, "learning_rate": 4.6e-07, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146688, "epoch": 4.70707071, "global_step/max_steps": "235/245", "percentage": "95.92%", "elapsed_time": "26m 41s", "remaining_time": "1m 8s"} +{"loss": 0.18792341, "token_acc": 0.9343817, "grad_norm": 0.51096237, "learning_rate": 1.1e-07, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146335, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "27m 19s", "remaining_time": "34s"} 
+{"eval_loss": 0.38130462, "eval_token_acc": 0.74704724, "eval_runtime": 0.9483, "eval_samples_per_second": 4.218, "eval_steps_per_second": 4.218, "epoch": 4.80808081, "global_step/max_steps": "240/245", "percentage": "97.96%", "elapsed_time": "27m 20s", "remaining_time": "34s"} +{"loss": 0.22356274, "token_acc": 0.91546904, "grad_norm": 0.54971743, "learning_rate": 0.0, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14648, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 52s", "remaining_time": "0s"} +{"eval_loss": 0.38277411, "eval_token_acc": 0.7519685, "eval_runtime": 0.9362, "eval_samples_per_second": 4.273, "eval_steps_per_second": 4.273, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 53s", "remaining_time": "0s"} +{"train_runtime": 1674.3199, "train_samples_per_second": 1.183, "train_steps_per_second": 0.146, "total_flos": 1.271899005323305e+17, "train_loss": 0.37796208, "epoch": 4.90909091, "global_step/max_steps": "245/245", "percentage": "100.00%", "elapsed_time": "27m 54s", "remaining_time": "0s"} +{"train_dataset": "775.398990±644.578527, min=41.000000, max=4149.000000, size=396", "val_dataset": "311.500000±316.897854, min=85.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 14838.8465M Params (68.8128M Trainable [0.4637%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-245", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/checkpoint-240", "best_metric": 0.38130462, "global_step": 245, "log_history": [{"loss": 0.6009877920150757, "token_acc": 0.8543944031482291, "grad_norm": 0.21148298680782318, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 38.38, "train_speed(iter/s)": 0.028445, 
"epoch": 0.020202020202020204, "step": 1}, {"loss": 0.9119688868522644, "token_acc": 0.7889227360841872, "grad_norm": 0.5691107511520386, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 48.25, "train_speed(iter/s)": 0.08115, "epoch": 0.10101010101010101, "step": 5}, {"loss": 0.7359254837036133, "token_acc": 0.8115517500375544, "grad_norm": 0.27697205543518066, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 54.71, "train_speed(iter/s)": 0.106586, "epoch": 0.20202020202020202, "step": 10}, {"loss": 0.5770383834838867, "token_acc": 0.8346688741721854, "grad_norm": 0.48245590925216675, "learning_rate": 9.99816643111642e-05, "memory(GiB)": 61.19, "train_speed(iter/s)": 0.117053, "epoch": 0.30303030303030304, "step": 15}, {"loss": 0.5489081859588623, "token_acc": 0.8306888325065324, "grad_norm": 0.17850157618522644, "learning_rate": 9.977554222133292e-05, "memory(GiB)": 61.19, "train_speed(iter/s)": 0.124147, "epoch": 0.40404040404040403, "step": 20}, {"eval_loss": 0.6602479815483093, "eval_token_acc": 0.7263779527559056, "eval_runtime": 0.8576, "eval_samples_per_second": 4.664, "eval_steps_per_second": 4.664, "epoch": 0.40404040404040403, "step": 20}, {"loss": 0.5102836608886718, "token_acc": 0.8355892096545197, "grad_norm": 0.12782400846481323, "learning_rate": 9.934132612707632e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.126266, "epoch": 0.5050505050505051, "step": 25}, {"loss": 0.5704391956329345, "token_acc": 0.8209851527097226, "grad_norm": 0.152016744017601, "learning_rate": 9.868100580255466e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.129292, "epoch": 0.6060606060606061, "step": 30}, {"loss": 0.5319510459899902, "token_acc": 0.8311028114949683, "grad_norm": 0.14646950364112854, "learning_rate": 9.779760713358059e-05, "memory(GiB)": 70.6, "train_speed(iter/s)": 0.131949, "epoch": 0.7070707070707071, "step": 35}, {"loss": 0.42771115303039553, "token_acc": 0.8606268231463265, "grad_norm": 0.14991414546966553, "learning_rate": 
9.669517825164434e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.133725, "epoch": 0.8080808080808081, "step": 40}, {"eval_loss": 0.5462462902069092, "eval_token_acc": 0.7362204724409449, "eval_runtime": 0.8819, "eval_samples_per_second": 4.536, "eval_steps_per_second": 4.536, "epoch": 0.8080808080808081, "step": 40}, {"loss": 0.4666141986846924, "token_acc": 0.8414501379010016, "grad_norm": 0.1364576518535614, "learning_rate": 9.537877098354786e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.134892, "epoch": 0.9090909090909091, "step": 45}, {"loss": 0.5955597877502441, "token_acc": 0.8128776297590449, "grad_norm": 0.11755681782960892, "learning_rate": 9.385441770165385e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.13723, "epoch": 1.0, "step": 50}, {"loss": 0.5069296360015869, "token_acc": 0.8350234910528613, "grad_norm": 0.12964965403079987, "learning_rate": 9.212910368083245e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.137114, "epoch": 1.101010101010101, "step": 55}, {"loss": 0.448743200302124, "token_acc": 0.8458073103307054, "grad_norm": 0.15529637038707733, "learning_rate": 9.021073508877845e-05, "memory(GiB)": 70.61, "train_speed(iter/s)": 0.137926, "epoch": 1.202020202020202, "step": 60}, {"eval_loss": 0.5411906838417053, "eval_token_acc": 0.7362204724409449, "eval_runtime": 1.0083, "eval_samples_per_second": 3.967, "eval_steps_per_second": 3.967, "epoch": 1.202020202020202, "step": 60}, {"loss": 0.5194157123565674, "token_acc": 0.829247550378998, "grad_norm": 0.17046210169792175, "learning_rate": 8.810810275638183e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.138628, "epoch": 1.303030303030303, "step": 65}, {"loss": 0.37682528495788575, "token_acc": 0.8765703517587939, "grad_norm": 0.15912939608097076, "learning_rate": 8.583084189417224e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.138973, "epoch": 1.404040404040404, "step": 70}, {"loss": 0.48751306533813477, "token_acc": 0.8434895707885737, "grad_norm": 0.16839051246643066, 
"learning_rate": 8.338938793943478e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.140411, "epoch": 1.5050505050505052, "step": 75}, {"loss": 0.4565874099731445, "token_acc": 0.8534172661870504, "grad_norm": 0.32181790471076965, "learning_rate": 8.079492873632554e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141042, "epoch": 1.606060606060606, "step": 80}, {"eval_loss": 0.4897439479827881, "eval_token_acc": 0.7421259842519685, "eval_runtime": 0.956, "eval_samples_per_second": 4.184, "eval_steps_per_second": 4.184, "epoch": 1.606060606060606, "step": 80}, {"loss": 0.4354836463928223, "token_acc": 0.8526262111167772, "grad_norm": 0.1729101538658142, "learning_rate": 7.805935326811912e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141128, "epoch": 1.7070707070707072, "step": 85}, {"loss": 0.4280831336975098, "token_acc": 0.855290819901892, "grad_norm": 0.2045697569847107, "learning_rate": 7.519519717652039e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.141744, "epoch": 1.808080808080808, "step": 90}, {"loss": 0.449586820602417, "token_acc": 0.8496861579121242, "grad_norm": 0.1761767864227295, "learning_rate": 7.221558531769519e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.142084, "epoch": 1.9090909090909092, "step": 95}, {"loss": 0.42734193801879883, "token_acc": 0.8712043289656097, "grad_norm": 0.3147103786468506, "learning_rate": 6.91341716182545e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143097, "epoch": 2.0, "step": 100}, {"eval_loss": 0.4399929940700531, "eval_token_acc": 0.7480314960629921, "eval_runtime": 0.9538, "eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "epoch": 2.0, "step": 100}, {"loss": 0.3569289445877075, "token_acc": 0.8721506442021804, "grad_norm": 0.2544735074043274, "learning_rate": 6.5965076506799e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143239, "epoch": 2.101010101010101, "step": 105}, {"loss": 0.3037956953048706, "token_acc": 0.9014316482959585, "grad_norm": 0.17970111966133118, 
"learning_rate": 6.272282220774091e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143197, "epoch": 2.202020202020202, "step": 110}, {"loss": 0.36233837604522706, "token_acc": 0.88071698416526, "grad_norm": 0.24640092253684998, "learning_rate": 5.9422266193915924e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.143856, "epoch": 2.303030303030303, "step": 115}, {"loss": 0.33151195049285886, "token_acc": 0.8921691877205077, "grad_norm": 0.42206674814224243, "learning_rate": 5.6078533102935745e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144277, "epoch": 2.404040404040404, "step": 120}, {"eval_loss": 0.4253988265991211, "eval_token_acc": 0.7470472440944882, "eval_runtime": 0.8511, "eval_samples_per_second": 4.7, "eval_steps_per_second": 4.7, "epoch": 2.404040404040404, "step": 120}, {"loss": 0.34746339321136477, "token_acc": 0.8732354800196643, "grad_norm": 0.3131689429283142, "learning_rate": 5.270694542927088e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144095, "epoch": 2.505050505050505, "step": 125}, {"loss": 0.38753666877746584, "token_acc": 0.8696001223335117, "grad_norm": 0.2918804883956909, "learning_rate": 4.9322953309663916e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144472, "epoch": 2.606060606060606, "step": 130}, {"loss": 0.3318129062652588, "token_acc": 0.8886697685884202, "grad_norm": 0.3330991566181183, "learning_rate": 4.594206372362845e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144381, "epoch": 2.707070707070707, "step": 135}, {"loss": 0.38605782985687254, "token_acc": 0.8668198982104744, "grad_norm": 0.31272196769714355, "learning_rate": 4.2579769433468694e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144688, "epoch": 2.808080808080808, "step": 140}, {"eval_loss": 0.4327184557914734, "eval_token_acc": 0.7470472440944882, "eval_runtime": 0.8971, "eval_samples_per_second": 4.459, "eval_steps_per_second": 4.459, "epoch": 2.808080808080808, "step": 140}, {"loss": 0.4013655662536621, "token_acc": 
0.8638609643891634, "grad_norm": 0.35478416085243225, "learning_rate": 3.92514779894488e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144401, "epoch": 2.909090909090909, "step": 145}, {"loss": 0.428325891494751, "token_acc": 0.8648929716613225, "grad_norm": 0.5760998725891113, "learning_rate": 3.597244112544208e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144976, "epoch": 3.0, "step": 150}, {"loss": 0.28979499340057374, "token_acc": 0.908249991466703, "grad_norm": 0.4245682656764984, "learning_rate": 3.275768486860149e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145038, "epoch": 3.101010101010101, "step": 155}, {"loss": 0.25866198539733887, "token_acc": 0.9131860922807785, "grad_norm": 0.2930150330066681, "learning_rate": 2.962194068331996e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145129, "epoch": 3.202020202020202, "step": 160}, {"eval_loss": 0.40826284885406494, "eval_token_acc": 0.7490157480314961, "eval_runtime": 0.8819, "eval_samples_per_second": 4.535, "eval_steps_per_second": 4.535, "epoch": 3.202020202020202, "step": 160}, {"loss": 0.27140424251556394, "token_acc": 0.9090280491837778, "grad_norm": 0.5396267771720886, "learning_rate": 2.65795779650105e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.144897, "epoch": 3.303030303030303, "step": 165}, {"loss": 0.27426331043243407, "token_acc": 0.905309461336626, "grad_norm": 0.4830700159072876, "learning_rate": 2.3644538193049625e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145211, "epoch": 3.404040404040404, "step": 170}, {"loss": 0.27257814407348635, "token_acc": 0.9106268856855515, "grad_norm": 0.489519327878952, "learning_rate": 2.08302710446253e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145332, "epoch": 3.505050505050505, "step": 175}, {"loss": 0.21682074069976806, "token_acc": 0.9202954023506101, "grad_norm": 0.5418012738227844, "learning_rate": 1.8149672762244624e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145793, "epoch": 3.606060606060606, 
"step": 180}, {"eval_loss": 0.40150904655456543, "eval_token_acc": 0.7450787401574803, "eval_runtime": 0.9846, "eval_samples_per_second": 4.063, "eval_steps_per_second": 4.063, "epoch": 3.606060606060606, "step": 180}, {"loss": 0.24239687919616698, "token_acc": 0.9123015991185491, "grad_norm": 0.4423423409461975, "learning_rate": 1.561502705732883e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14543, "epoch": 3.707070707070707, "step": 185}, {"loss": 0.2875248908996582, "token_acc": 0.8957864133327873, "grad_norm": 0.2829873263835907, "learning_rate": 1.3237948820702495e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.145672, "epoch": 3.808080808080808, "step": 190}, {"loss": 0.2387687921524048, "token_acc": 0.9186982386562463, "grad_norm": 0.5757401585578918, "learning_rate": 1.102933089792042e-05, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14583, "epoch": 3.909090909090909, "step": 195}, {"loss": 0.2767722368240356, "token_acc": 0.9056017545462853, "grad_norm": 0.6690001487731934, "learning_rate": 8.999294173332058e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146225, "epoch": 4.0, "step": 200}, {"eval_loss": 0.38474351167678833, "eval_token_acc": 0.7480314960629921, "eval_runtime": 0.8708, "eval_samples_per_second": 4.593, "eval_steps_per_second": 4.593, "epoch": 4.0, "step": 200}, {"loss": 0.16934740543365479, "token_acc": 0.9342587237324079, "grad_norm": 0.6503894925117493, "learning_rate": 7.157141191620548e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14617, "epoch": 4.101010101010101, "step": 205}, {"loss": 0.24090688228607177, "token_acc": 0.9170140698280355, "grad_norm": 0.4198131859302521, "learning_rate": 5.5113135293435815e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146031, "epoch": 4.202020202020202, "step": 210}, {"loss": 0.25371742248535156, "token_acc": 0.9176332202647992, "grad_norm": 0.37690314650535583, "learning_rate": 4.069353111818913e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.1459, "epoch": 
4.303030303030303, "step": 215}, {"loss": 0.23050858974456787, "token_acc": 0.9262000205570974, "grad_norm": 0.47662192583084106, "learning_rate": 2.8378676526178482e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146025, "epoch": 4.404040404040404, "step": 220}, {"eval_loss": 0.3850444257259369, "eval_token_acc": 0.7480314960629921, "eval_runtime": 0.9782, "eval_samples_per_second": 4.089, "eval_steps_per_second": 4.089, "epoch": 4.404040404040404, "step": 220}, {"loss": 0.1852494955062866, "token_acc": 0.9320099255583126, "grad_norm": 0.48112934827804565, "learning_rate": 1.8225003740388547e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146041, "epoch": 4.505050505050505, "step": 225}, {"loss": 0.17858517169952393, "token_acc": 0.9379441985135849, "grad_norm": 0.5440866947174072, "learning_rate": 1.0279041473154116e-06, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146314, "epoch": 4.606060606060606, "step": 230}, {"loss": 0.23350396156311035, "token_acc": 0.9229762656799928, "grad_norm": 0.4088594615459442, "learning_rate": 4.577201710596612e-07, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146688, "epoch": 4.707070707070707, "step": 235}, {"loss": 0.18792340755462647, "token_acc": 0.9343817012151536, "grad_norm": 0.5109623670578003, "learning_rate": 1.1456128564660273e-07, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.146335, "epoch": 4.808080808080808, "step": 240}, {"eval_loss": 0.38130462169647217, "eval_token_acc": 0.7470472440944882, "eval_runtime": 0.9483, "eval_samples_per_second": 4.218, "eval_steps_per_second": 4.218, "epoch": 4.808080808080808, "step": 240}, {"loss": 0.22356274127960205, "token_acc": 0.9154690412093601, "grad_norm": 0.5497174263000488, "learning_rate": 0.0, "memory(GiB)": 70.62, "train_speed(iter/s)": 0.14648, "epoch": 4.909090909090909, "step": 245}, {"eval_loss": 0.38277411460876465, "eval_token_acc": 0.7519685039370079, "eval_runtime": 0.9362, "eval_samples_per_second": 4.273, "eval_steps_per_second": 4.273, "epoch": 
4.909090909090909, "step": 245}, {"train_runtime": 1674.3199, "train_samples_per_second": 1.183, "train_steps_per_second": 0.146, "total_flos": 1.271899005323305e+17, "train_loss": 0.37796208420578314, "epoch": 4.909090909090909, "step": 245}], "memory": 70.6171875} diff --git a/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs/events.out.tfevents.1737762389.kml-dtmachine-18088-prod.70834.0 b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs/events.out.tfevents.1737762389.kml-dtmachine-18088-prod.70834.0 new file mode 100644 index 0000000000000000000000000000000000000000..386012ec8c379e033c893bb5752980b4c3770477 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-14b_400_0.5_sft_8192_rank16_epoch5_what/v0-20250124-234442/runs/events.out.tfevents.1737762389.kml-dtmachine-18088-prod.70834.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0b27a514a729d52a2af7be4dcca8c6622d55ac7fdbe363bb2a8cc8929179316 +size 29516 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-142659/runs/events.out.tfevents.1737728909.kml-task-547024-record-9965643-prod-worker-0.36995.0 b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-142659/runs/events.out.tfevents.1737728909.kml-task-547024-record-9965643-prod-worker-0.36995.0 new file mode 100644 index 0000000000000000000000000000000000000000..f401fb5e955e4d9ca2b8fbbf024492315907c3d2 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-142659/runs/events.out.tfevents.1737728909.kml-task-547024-record-9965643-prod-worker-0.36995.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09f067b7f6ff9930f53da3a8fffc2df65dfc92f060a8fec5e2218be269389146 +size 5932 diff --git 
a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/args.json new file mode 100644 index 0000000000000000000000000000000000000000..8f99135ca18d6f9dbe39c51b5857595c8ee2bce9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", 
+ "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": 
null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": 
null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + 
"lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, 
ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/README.md b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### 
Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a60675872493fa836badef060116183aeb9785a8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": 
"megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "q_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d77b28dd2a14aa66ed19933c3dac21e65b91ddaa --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f53a4e4995d3ed2246c5ed0f887287062b8b55962c90c53a4948f638561021b +size 536991984 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/additional_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..8f99135ca18d6f9dbe39c51b5857595c8ee2bce9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/optimizer.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c16ed12ddbdbf81ffab247c135bc4aa9ce2e41a --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da9866cd3302ba206a88f5191b0f3989a302833a81e9463144c59b86124ae9b +size 1074499986 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/rng_state.pth b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd2fe7cd9d6c51770a6ae72173257988f8754bad --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b852fc307d98a6a0a6aaab0bf8f844edc73238dcca34418827366ca2993f3b2d +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d2212ebbbb10dacbe1a1606284dbe00e474558f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ea7807968f14a7e6aa3199fa8036f47c82ff9fdd95652fbfa9a27983793616 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/trainer_state.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e01bada1ae577663b85ed1e9abff6dc05d82e6d6 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/trainer_state.json @@ -0,0 +1,1513 @@ +{ + "best_metric": 0.30427584, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600", + "epoch": 3.0303030303030303, + "eval_steps": 20, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.15279237926006317, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.594012975692749, + "memory(GiB)": 71.9, + "step": 1, + "token_acc": 0.8394495412844036, + "train_speed(iter/s)": 0.211078 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.3137172758579254, + "learning_rate": 1e-05, + "loss": 0.7095059156417847, + "memory(GiB)": 81.34, + "step": 5, + "token_acc": 0.829162656400385, + "train_speed(iter/s)": 0.342487 + }, + { + "epoch": 0.050505050505050504, + 
"grad_norm": 0.1773456335067749, + "learning_rate": 2e-05, + "loss": 0.7598193645477295, + "memory(GiB)": 87.85, + "step": 10, + "token_acc": 0.7958339958657974, + "train_speed(iter/s)": 0.368643 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.19639591872692108, + "learning_rate": 3e-05, + "loss": 0.7279319763183594, + "memory(GiB)": 98.3, + "step": 15, + "token_acc": 0.8007322175732218, + "train_speed(iter/s)": 0.370534 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.970768928527832, + "learning_rate": 4e-05, + "loss": 1.0443785667419434, + "memory(GiB)": 98.3, + "step": 20, + "token_acc": 0.8557748113755078, + "train_speed(iter/s)": 0.390742 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.6980665922164917, + "eval_runtime": 1.2677, + "eval_samples_per_second": 3.155, + "eval_steps_per_second": 3.155, + "eval_token_acc": 0.7106299212598425, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.24524027109146118, + "learning_rate": 5e-05, + "loss": 0.5366989612579346, + "memory(GiB)": 108.29, + "step": 25, + "token_acc": 0.8295410346168806, + "train_speed(iter/s)": 0.358309 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.46116694808006287, + "learning_rate": 6e-05, + "loss": 0.6157921314239502, + "memory(GiB)": 108.29, + "step": 30, + "token_acc": 0.8143241564893396, + "train_speed(iter/s)": 0.372063 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.15954609215259552, + "learning_rate": 7e-05, + "loss": 0.3712780952453613, + "memory(GiB)": 108.29, + "step": 35, + "token_acc": 0.8573959255978743, + "train_speed(iter/s)": 0.380317 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.11871866136789322, + "learning_rate": 8e-05, + "loss": 0.44252305030822753, + "memory(GiB)": 108.29, + "step": 40, + "token_acc": 0.8448576409064498, + "train_speed(iter/s)": 0.382931 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.6573391556739807, + "eval_runtime": 1.307, + "eval_samples_per_second": 3.06, + 
"eval_steps_per_second": 3.06, + "eval_token_acc": 0.7431102362204725, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.21376259624958038, + "learning_rate": 9e-05, + "loss": 0.45000429153442384, + "memory(GiB)": 108.29, + "step": 45, + "token_acc": 0.8373225152129817, + "train_speed(iter/s)": 0.369324 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.1434803009033203, + "learning_rate": 0.0001, + "loss": 0.5050764560699463, + "memory(GiB)": 118.38, + "step": 50, + "token_acc": 0.8796895213454075, + "train_speed(iter/s)": 0.359356 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.19765082001686096, + "learning_rate": 9.999301905929286e-05, + "loss": 0.45570597648620603, + "memory(GiB)": 118.38, + "step": 55, + "token_acc": 0.8511966701352758, + "train_speed(iter/s)": 0.35956 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.19557587802410126, + "learning_rate": 9.997207818651274e-05, + "loss": 0.3813853025436401, + "memory(GiB)": 118.38, + "step": 60, + "token_acc": 0.8593534125449019, + "train_speed(iter/s)": 0.367382 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.5886135101318359, + "eval_runtime": 1.2814, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 3.122, + "eval_token_acc": 0.7509842519685039, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.2849789559841156, + "learning_rate": 9.99371832291393e-05, + "loss": 0.550228500366211, + "memory(GiB)": 118.38, + "step": 65, + "token_acc": 0.826555830150528, + "train_speed(iter/s)": 0.36018 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.1136852502822876, + "learning_rate": 9.988834393115767e-05, + "loss": 0.41207499504089357, + "memory(GiB)": 118.38, + "step": 70, + "token_acc": 0.8746200607902735, + "train_speed(iter/s)": 0.357839 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.17141863703727722, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5810668468475342, + "memory(GiB)": 118.38, + "step": 75, 
+ "token_acc": 0.8287886733088621, + "train_speed(iter/s)": 0.364223 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.170815110206604, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6472435474395752, + "memory(GiB)": 118.38, + "step": 80, + "token_acc": 0.841726618705036, + "train_speed(iter/s)": 0.366011 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5714729428291321, + "eval_runtime": 1.2893, + "eval_samples_per_second": 3.103, + "eval_steps_per_second": 3.103, + "eval_token_acc": 0.7509842519685039, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.1489511877298355, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5299827575683593, + "memory(GiB)": 118.38, + "step": 85, + "token_acc": 0.8189440290052119, + "train_speed(iter/s)": 0.361053 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.33635759353637695, + "learning_rate": 9.9553874407739e-05, + "loss": 0.4386009693145752, + "memory(GiB)": 118.38, + "step": 90, + "token_acc": 0.8471794871794872, + "train_speed(iter/s)": 0.363532 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.13267917931079865, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3524549722671509, + "memory(GiB)": 118.38, + "step": 95, + "token_acc": 0.8693969284554875, + "train_speed(iter/s)": 0.362313 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1372370421886444, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4188431739807129, + "memory(GiB)": 132.92, + "step": 100, + "token_acc": 0.8678135405105438, + "train_speed(iter/s)": 0.35826 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.565014660358429, + "eval_runtime": 1.3087, + "eval_samples_per_second": 3.057, + "eval_steps_per_second": 3.057, + "eval_token_acc": 0.7588582677165354, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.18173210322856903, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5471820831298828, + "memory(GiB)": 132.92, + "step": 105, + "token_acc": 
0.8052507836990596, + "train_speed(iter/s)": 0.356931 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5932771563529968, + "learning_rate": 9.899808525182935e-05, + "loss": 0.45682454109191895, + "memory(GiB)": 132.92, + "step": 110, + "token_acc": 0.8191489361702128, + "train_speed(iter/s)": 0.362578 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.10078100115060806, + "learning_rate": 9.882482608435923e-05, + "loss": 0.44985551834106446, + "memory(GiB)": 132.92, + "step": 115, + "token_acc": 0.8674093690073966, + "train_speed(iter/s)": 0.357889 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.299698144197464, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5030841827392578, + "memory(GiB)": 132.92, + "step": 120, + "token_acc": 0.8129452223041022, + "train_speed(iter/s)": 0.357367 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.5346428155899048, + "eval_runtime": 1.2611, + "eval_samples_per_second": 3.172, + "eval_steps_per_second": 3.172, + "eval_token_acc": 0.7608267716535433, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.20031973719596863, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4732785701751709, + "memory(GiB)": 132.92, + "step": 125, + "token_acc": 0.8376591971626632, + "train_speed(iter/s)": 0.356721 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.21032185852527618, + "learning_rate": 9.822345875271883e-05, + "loss": 0.48586230278015136, + "memory(GiB)": 132.92, + "step": 130, + "token_acc": 0.8409764190069914, + "train_speed(iter/s)": 0.3569 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.10959025472402573, + "learning_rate": 9.799599295015154e-05, + "loss": 0.362222957611084, + "memory(GiB)": 132.92, + "step": 135, + "token_acc": 0.8709891275523733, + "train_speed(iter/s)": 0.356829 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.20188008248806, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5328386306762696, + "memory(GiB)": 132.92, + "step": 
140, + "token_acc": 0.8266873144921926, + "train_speed(iter/s)": 0.35701 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.49480319023132324, + "eval_runtime": 1.2596, + "eval_samples_per_second": 3.176, + "eval_steps_per_second": 3.176, + "eval_token_acc": 0.7519685039370079, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.21190020442008972, + "learning_rate": 9.750092174273521e-05, + "loss": 0.3609034061431885, + "memory(GiB)": 132.92, + "step": 145, + "token_acc": 0.856077862911576, + "train_speed(iter/s)": 0.353392 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.42249825596809387, + "learning_rate": 9.723345458039594e-05, + "loss": 0.37536747455596925, + "memory(GiB)": 132.92, + "step": 150, + "token_acc": 0.8812260536398467, + "train_speed(iter/s)": 0.354546 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.2383793443441391, + "learning_rate": 9.69527980602239e-05, + "loss": 0.41838736534118653, + "memory(GiB)": 132.92, + "step": 155, + "token_acc": 0.857667360176233, + "train_speed(iter/s)": 0.354581 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.20167972147464752, + "learning_rate": 9.665903055208014e-05, + "loss": 0.3394253969192505, + "memory(GiB)": 132.92, + "step": 160, + "token_acc": 0.8810381038103811, + "train_speed(iter/s)": 0.356298 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.48000994324684143, + "eval_runtime": 1.2638, + "eval_samples_per_second": 3.165, + "eval_steps_per_second": 3.165, + "eval_token_acc": 0.7549212598425197, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1416829228401184, + "learning_rate": 9.635223408690688e-05, + "loss": 0.41873645782470703, + "memory(GiB)": 132.92, + "step": 165, + "token_acc": 0.8491237317169588, + "train_speed(iter/s)": 0.354225 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.20265255868434906, + "learning_rate": 9.603249433382144e-05, + "loss": 0.4564688205718994, + "memory(GiB)": 132.92, + "step": 170, + 
"token_acc": 0.8519195612431444, + "train_speed(iter/s)": 0.35386 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.282372385263443, + "learning_rate": 9.569990057619414e-05, + "loss": 0.41248092651367185, + "memory(GiB)": 132.92, + "step": 175, + "token_acc": 0.8566410170625627, + "train_speed(iter/s)": 0.355591 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.1956530064344406, + "learning_rate": 9.535454568671704e-05, + "loss": 0.4143404006958008, + "memory(GiB)": 132.92, + "step": 180, + "token_acc": 0.8632127625967462, + "train_speed(iter/s)": 0.356508 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.4819040894508362, + "eval_runtime": 1.2641, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.7637795275590551, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.25014156103134155, + "learning_rate": 9.49965261014704e-05, + "loss": 0.4956723690032959, + "memory(GiB)": 132.92, + "step": 185, + "token_acc": 0.8175658720200752, + "train_speed(iter/s)": 0.355286 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 3.0596258640289307, + "learning_rate": 9.462594179299406e-05, + "loss": 0.5832932949066162, + "memory(GiB)": 132.92, + "step": 190, + "token_acc": 0.8024002232765839, + "train_speed(iter/s)": 0.357668 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.1552937924861908, + "learning_rate": 9.424289624237144e-05, + "loss": 0.559151029586792, + "memory(GiB)": 132.92, + "step": 195, + "token_acc": 0.8146739738284309, + "train_speed(iter/s)": 0.3574 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.15638215839862823, + "learning_rate": 9.384749641033359e-05, + "loss": 0.5025578498840332, + "memory(GiB)": 132.92, + "step": 200, + "token_acc": 0.8483137494277431, + "train_speed(iter/s)": 0.353969 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4734075665473938, + "eval_runtime": 1.2465, + "eval_samples_per_second": 3.209, + "eval_steps_per_second": 3.209, + 
"eval_token_acc": 0.7667322834645669, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.16425618529319763, + "learning_rate": 9.343985270739182e-05, + "loss": 0.45037288665771485, + "memory(GiB)": 132.92, + "step": 205, + "token_acc": 0.835399107585523, + "train_speed(iter/s)": 0.350091 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.21121171116828918, + "learning_rate": 9.302007896300698e-05, + "loss": 0.42234115600585936, + "memory(GiB)": 132.92, + "step": 210, + "token_acc": 0.8614671060661541, + "train_speed(iter/s)": 0.350171 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.18083210289478302, + "learning_rate": 9.25882923938038e-05, + "loss": 0.3796833992004395, + "memory(GiB)": 132.92, + "step": 215, + "token_acc": 0.8746039856923863, + "train_speed(iter/s)": 0.349555 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.30925488471984863, + "learning_rate": 9.214461357083985e-05, + "loss": 0.3580619812011719, + "memory(GiB)": 132.92, + "step": 220, + "token_acc": 0.8796054540179866, + "train_speed(iter/s)": 0.350637 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.4918195605278015, + "eval_runtime": 1.2455, + "eval_samples_per_second": 3.212, + "eval_steps_per_second": 3.212, + "eval_token_acc": 0.7627952755905512, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.36294564604759216, + "learning_rate": 9.168916638593736e-05, + "loss": 0.4854443550109863, + "memory(GiB)": 132.92, + "step": 225, + "token_acc": 0.8444093422091843, + "train_speed(iter/s)": 0.349163 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.2092873752117157, + "learning_rate": 9.122207801708802e-05, + "loss": 0.39215381145477296, + "memory(GiB)": 132.92, + "step": 230, + "token_acc": 0.8688397695020211, + "train_speed(iter/s)": 0.347175 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.24130743741989136, + "learning_rate": 9.074347889294016e-05, + "loss": 0.20322649478912352, + "memory(GiB)": 132.92, + 
"step": 235, + "token_acc": 0.9091831557584982, + "train_speed(iter/s)": 0.349366 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.4042667746543884, + "learning_rate": 9.025350265637815e-05, + "loss": 0.44439196586608887, + "memory(GiB)": 132.92, + "step": 240, + "token_acc": 0.8551265412070085, + "train_speed(iter/s)": 0.350626 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5139080882072449, + "eval_runtime": 1.2422, + "eval_samples_per_second": 3.22, + "eval_steps_per_second": 3.22, + "eval_token_acc": 0.7627952755905512, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.2574117183685303, + "learning_rate": 8.975228612720416e-05, + "loss": 0.29034128189086916, + "memory(GiB)": 132.92, + "step": 245, + "token_acc": 0.8568427855873324, + "train_speed(iter/s)": 0.350706 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.49813297390937805, + "learning_rate": 8.923996926393305e-05, + "loss": 0.4220092296600342, + "memory(GiB)": 132.92, + "step": 250, + "token_acc": 0.8524024422617468, + "train_speed(iter/s)": 0.352825 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.2515278458595276, + "learning_rate": 8.871669512471068e-05, + "loss": 0.37980899810791013, + "memory(GiB)": 132.92, + "step": 255, + "token_acc": 0.8598113725849281, + "train_speed(iter/s)": 0.351391 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.43190768361091614, + "learning_rate": 8.818260982736661e-05, + "loss": 0.38159129619598386, + "memory(GiB)": 132.92, + "step": 260, + "token_acc": 0.8543320676561961, + "train_speed(iter/s)": 0.352512 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4426303505897522, + "eval_runtime": 1.2477, + "eval_samples_per_second": 3.206, + "eval_steps_per_second": 3.206, + "eval_token_acc": 0.7706692913385826, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.12871959805488586, + "learning_rate": 8.763786250861256e-05, + "loss": 0.30707058906555174, + "memory(GiB)": 132.92, + "step": 
265, + "token_acc": 0.8950704812745016, + "train_speed(iter/s)": 0.348922 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.2897852659225464, + "learning_rate": 8.708260528239788e-05, + "loss": 0.2806516170501709, + "memory(GiB)": 132.93, + "step": 270, + "token_acc": 0.8953946242081835, + "train_speed(iter/s)": 0.350022 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.2160148024559021, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2911843299865723, + "memory(GiB)": 132.93, + "step": 275, + "token_acc": 0.8955285818030916, + "train_speed(iter/s)": 0.349742 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.5827478170394897, + "learning_rate": 8.594118419389647e-05, + "loss": 0.41789636611938474, + "memory(GiB)": 132.93, + "step": 280, + "token_acc": 0.8685470085470085, + "train_speed(iter/s)": 0.350778 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.42635607719421387, + "eval_runtime": 1.244, + "eval_samples_per_second": 3.215, + "eval_steps_per_second": 3.215, + "eval_token_acc": 0.7736220472440944, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.5320525765419006, + "learning_rate": 8.535533905932738e-05, + "loss": 0.23049118518829345, + "memory(GiB)": 132.93, + "step": 285, + "token_acc": 0.8896126157010628, + "train_speed(iter/s)": 0.350607 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.4473183751106262, + "learning_rate": 8.475962138373213e-05, + "loss": 0.36380805969238283, + "memory(GiB)": 132.93, + "step": 290, + "token_acc": 0.8619923216811477, + "train_speed(iter/s)": 0.352023 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.30866506695747375, + "learning_rate": 8.415419751390155e-05, + "loss": 0.395569372177124, + "memory(GiB)": 132.93, + "step": 295, + "token_acc": 0.8510589842860397, + "train_speed(iter/s)": 0.353538 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.6120028495788574, + "learning_rate": 8.353923650696118e-05, + "loss": 0.32523369789123535, + 
"memory(GiB)": 132.93, + "step": 300, + "token_acc": 0.8855692530819435, + "train_speed(iter/s)": 0.353789 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.45433884859085083, + "eval_runtime": 1.2577, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 3.18, + "eval_token_acc": 0.7775590551181102, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.3755347430706024, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3657505035400391, + "memory(GiB)": 132.93, + "step": 305, + "token_acc": 0.8681001582425766, + "train_speed(iter/s)": 0.351379 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.33566632866859436, + "learning_rate": 8.228139257794012e-05, + "loss": 0.30150370597839354, + "memory(GiB)": 132.93, + "step": 310, + "token_acc": 0.900846170535908, + "train_speed(iter/s)": 0.352623 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.8437044024467468, + "learning_rate": 8.163886089321493e-05, + "loss": 0.3159091234207153, + "memory(GiB)": 132.93, + "step": 315, + "token_acc": 0.8779527559055118, + "train_speed(iter/s)": 0.353918 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.3801237642765045, + "learning_rate": 8.098749444801224e-05, + "loss": 0.33113200664520265, + "memory(GiB)": 132.93, + "step": 320, + "token_acc": 0.9020088943413587, + "train_speed(iter/s)": 0.354333 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.43398547172546387, + "eval_runtime": 1.2796, + "eval_samples_per_second": 3.126, + "eval_steps_per_second": 3.126, + "eval_token_acc": 0.7726377952755905, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.37100571393966675, + "learning_rate": 8.032747512835337e-05, + "loss": 0.34933011531829833, + "memory(GiB)": 132.93, + "step": 325, + "token_acc": 0.8527973927213471, + "train_speed(iter/s)": 0.353766 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.580182671546936, + "learning_rate": 7.965898723646776e-05, + "loss": 0.37286627292633057, + 
"memory(GiB)": 132.93, + "step": 330, + "token_acc": 0.8843691926491843, + "train_speed(iter/s)": 0.354876 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.3128257989883423, + "learning_rate": 7.898221743932888e-05, + "loss": 0.3794433116912842, + "memory(GiB)": 132.93, + "step": 335, + "token_acc": 0.8727474355420016, + "train_speed(iter/s)": 0.354891 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.8151857852935791, + "learning_rate": 7.829735471652978e-05, + "loss": 0.262727689743042, + "memory(GiB)": 132.93, + "step": 340, + "token_acc": 0.9075043630017452, + "train_speed(iter/s)": 0.355958 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.37076544761657715, + "eval_runtime": 1.2578, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 3.18, + "eval_token_acc": 0.7755905511811023, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.32429400086402893, + "learning_rate": 7.760459030751284e-05, + "loss": 0.28121564388275144, + "memory(GiB)": 132.93, + "step": 345, + "token_acc": 0.8951330717845404, + "train_speed(iter/s)": 0.354629 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.0614266395568848, + "learning_rate": 7.690411765816864e-05, + "loss": 0.20163230895996093, + "memory(GiB)": 132.93, + "step": 350, + "token_acc": 0.920251572327044, + "train_speed(iter/s)": 0.355616 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.626061737537384, + "learning_rate": 7.619613236681843e-05, + "loss": 0.4713843822479248, + "memory(GiB)": 132.93, + "step": 355, + "token_acc": 0.8495345016429354, + "train_speed(iter/s)": 0.355891 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.2624683976173401, + "learning_rate": 7.548083212959588e-05, + "loss": 0.25896482467651366, + "memory(GiB)": 132.93, + "step": 360, + "token_acc": 0.8964471929186371, + "train_speed(iter/s)": 0.356199 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.3961385488510132, + "eval_runtime": 1.2624, + "eval_samples_per_second": 
3.168, + "eval_steps_per_second": 3.168, + "eval_token_acc": 0.7765748031496063, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.7176984548568726, + "learning_rate": 7.475841668524268e-05, + "loss": 0.38772385120391845, + "memory(GiB)": 132.93, + "step": 365, + "token_acc": 0.8543818727090969, + "train_speed(iter/s)": 0.356015 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.27225521206855774, + "learning_rate": 7.402908775933419e-05, + "loss": 0.34966278076171875, + "memory(GiB)": 132.93, + "step": 370, + "token_acc": 0.8757403751233959, + "train_speed(iter/s)": 0.356009 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.4194507300853729, + "learning_rate": 7.329304900794991e-05, + "loss": 0.4062873363494873, + "memory(GiB)": 132.93, + "step": 375, + "token_acc": 0.8658723605048956, + "train_speed(iter/s)": 0.355662 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.6045131683349609, + "learning_rate": 7.255050596080509e-05, + "loss": 0.352255654335022, + "memory(GiB)": 132.93, + "step": 380, + "token_acc": 0.8888125343595382, + "train_speed(iter/s)": 0.356078 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.38526344299316406, + "eval_runtime": 1.2874, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 3.107, + "eval_token_acc": 0.7726377952755905, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.16466927528381348, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3758531093597412, + "memory(GiB)": 132.93, + "step": 385, + "token_acc": 0.8816031376394166, + "train_speed(iter/s)": 0.354701 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.324630469083786, + "learning_rate": 7.104673812141675e-05, + "loss": 0.25691215991973876, + "memory(GiB)": 132.93, + "step": 390, + "token_acc": 0.8995055766356215, + "train_speed(iter/s)": 0.354307 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3074846863746643, + "learning_rate": 7.02859332377382e-05, + "loss": 
0.2570985794067383, + "memory(GiB)": 132.93, + "step": 395, + "token_acc": 0.899527983816588, + "train_speed(iter/s)": 0.35515 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.5683487057685852, + "learning_rate": 6.951946375817474e-05, + "loss": 0.22938873767852783, + "memory(GiB)": 132.93, + "step": 400, + "token_acc": 0.9419516786946972, + "train_speed(iter/s)": 0.356409 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3636195659637451, + "eval_runtime": 1.2559, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 3.185, + "eval_token_acc": 0.7677165354330708, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6048702001571655, + "learning_rate": 6.874754370984606e-05, + "loss": 0.15864256620407105, + "memory(GiB)": 132.93, + "step": 405, + "token_acc": 0.9092146454335103, + "train_speed(iter/s)": 0.355742 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.7089707851409912, + "learning_rate": 6.797038864187564e-05, + "loss": 0.3233179092407227, + "memory(GiB)": 132.93, + "step": 410, + "token_acc": 0.9176615891313298, + "train_speed(iter/s)": 0.355669 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.370557576417923, + "learning_rate": 6.718821556520151e-05, + "loss": 0.19509116411209107, + "memory(GiB)": 132.93, + "step": 415, + "token_acc": 0.9241970021413276, + "train_speed(iter/s)": 0.356398 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.2815419137477875, + "learning_rate": 6.640124289197845e-05, + "loss": 0.09936256408691406, + "memory(GiB)": 132.93, + "step": 420, + "token_acc": 0.9714867617107943, + "train_speed(iter/s)": 0.35724 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.37009483575820923, + "eval_runtime": 1.2502, + "eval_samples_per_second": 3.199, + "eval_steps_per_second": 3.199, + "eval_token_acc": 0.7736220472440944, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.6189222931861877, + "learning_rate": 6.560969037458933e-05, + "loss": 
0.18864725828170775, + "memory(GiB)": 132.93, + "step": 425, + "token_acc": 0.8931275480489226, + "train_speed(iter/s)": 0.357016 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.4722209870815277, + "learning_rate": 6.481377904428171e-05, + "loss": 0.13603065013885499, + "memory(GiB)": 132.93, + "step": 430, + "token_acc": 0.9594075079149706, + "train_speed(iter/s)": 0.356484 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.25799301266670227, + "learning_rate": 6.401373114944781e-05, + "loss": 0.1884603261947632, + "memory(GiB)": 132.93, + "step": 435, + "token_acc": 0.9427539503386004, + "train_speed(iter/s)": 0.355717 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.847876787185669, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2138120174407959, + "memory(GiB)": 132.93, + "step": 440, + "token_acc": 0.9274074074074075, + "train_speed(iter/s)": 0.356335 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.378639817237854, + "eval_runtime": 1.236, + "eval_samples_per_second": 3.236, + "eval_steps_per_second": 3.236, + "eval_token_acc": 0.7736220472440944, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.38243409991264343, + "learning_rate": 6.240212037280966e-05, + "loss": 0.12016980648040772, + "memory(GiB)": 132.93, + "step": 445, + "token_acc": 0.9293953606287235, + "train_speed(iter/s)": 0.355702 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.717224657535553, + "learning_rate": 6.159100751337642e-05, + "loss": 0.2691728830337524, + "memory(GiB)": 132.93, + "step": 450, + "token_acc": 0.905337548819044, + "train_speed(iter/s)": 0.356304 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.24402710795402527, + "learning_rate": 6.077665800849568e-05, + "loss": 0.18497172594070435, + "memory(GiB)": 132.93, + "step": 455, + "token_acc": 0.9337628865979382, + "train_speed(iter/s)": 0.356536 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.246830552816391, + "learning_rate": 
5.99592992551918e-05, + "loss": 0.19234393835067748, + "memory(GiB)": 132.93, + "step": 460, + "token_acc": 0.9360821581851625, + "train_speed(iter/s)": 0.356885 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3614441454410553, + "eval_runtime": 1.2634, + "eval_samples_per_second": 3.166, + "eval_steps_per_second": 3.166, + "eval_token_acc": 0.7716535433070866, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.48954588174819946, + "learning_rate": 5.913915949078452e-05, + "loss": 0.17323193550109864, + "memory(GiB)": 132.93, + "step": 465, + "token_acc": 0.8919093851132686, + "train_speed(iter/s)": 0.357128 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.5584018230438232, + "learning_rate": 5.831646772915651e-05, + "loss": 0.13749444484710693, + "memory(GiB)": 132.93, + "step": 470, + "token_acc": 0.9434219495569189, + "train_speed(iter/s)": 0.35673 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 1.301836371421814, + "learning_rate": 5.749145369680407e-05, + "loss": 0.20462331771850586, + "memory(GiB)": 132.93, + "step": 475, + "token_acc": 0.9227237949502678, + "train_speed(iter/s)": 0.357258 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.33143967390060425, + "learning_rate": 5.666434776868895e-05, + "loss": 0.20529029369354249, + "memory(GiB)": 132.93, + "step": 480, + "token_acc": 0.9261273320505312, + "train_speed(iter/s)": 0.356138 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.3671746850013733, + "eval_runtime": 1.2559, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 3.185, + "eval_token_acc": 0.7618110236220472, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.6394232511520386, + "learning_rate": 5.583538090390882e-05, + "loss": 0.148415470123291, + "memory(GiB)": 132.93, + "step": 485, + "token_acc": 0.8970821081203347, + "train_speed(iter/s)": 0.356392 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5502394437789917, + "learning_rate": 
5.5004784581204927e-05, + "loss": 0.2857876539230347, + "memory(GiB)": 132.93, + "step": 490, + "token_acc": 0.895648670427075, + "train_speed(iter/s)": 0.356615 + }, + { + "epoch": 2.5, + "grad_norm": 0.2799193263053894, + "learning_rate": 5.41727907343245e-05, + "loss": 0.15979899168014527, + "memory(GiB)": 132.93, + "step": 495, + "token_acc": 0.9382183908045977, + "train_speed(iter/s)": 0.357352 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.47879472374916077, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.23016483783721925, + "memory(GiB)": 132.93, + "step": 500, + "token_acc": 0.8928835262250677, + "train_speed(iter/s)": 0.357166 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.36765411496162415, + "eval_runtime": 1.2648, + "eval_samples_per_second": 3.162, + "eval_steps_per_second": 3.162, + "eval_token_acc": 0.7706692913385826, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.5626565217971802, + "learning_rate": 5.250554008935596e-05, + "loss": 0.15926196575164794, + "memory(GiB)": 132.93, + "step": 505, + "token_acc": 0.90470706779905, + "train_speed(iter/s)": 0.356766 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.7293064594268799, + "learning_rate": 5.167074885038373e-05, + "loss": 0.15416876077651978, + "memory(GiB)": 132.93, + "step": 510, + "token_acc": 0.9385658067337123, + "train_speed(iter/s)": 0.357626 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.32495784759521484, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.18747940063476562, + "memory(GiB)": 132.93, + "step": 515, + "token_acc": 0.9271042471042471, + "train_speed(iter/s)": 0.358064 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.372670978307724, + "learning_rate": 5e-05, + "loss": 0.19285820722579955, + "memory(GiB)": 132.93, + "step": 520, + "token_acc": 0.9182068423122296, + "train_speed(iter/s)": 0.358116 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.35743266344070435, + "eval_runtime": 1.2997, + 
"eval_samples_per_second": 3.078, + "eval_steps_per_second": 3.078, + "eval_token_acc": 0.7687007874015748, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.4129459857940674, + "learning_rate": 4.916450892453495e-05, + "loss": 0.1654897451400757, + "memory(GiB)": 132.93, + "step": 525, + "token_acc": 0.9160741885625966, + "train_speed(iter/s)": 0.357945 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3547585904598236, + "learning_rate": 4.832925114961629e-05, + "loss": 0.22736096382141113, + "memory(GiB)": 132.93, + "step": 530, + "token_acc": 0.9109231599784056, + "train_speed(iter/s)": 0.357177 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7628602385520935, + "learning_rate": 4.749445991064404e-05, + "loss": 0.16493122577667235, + "memory(GiB)": 132.93, + "step": 535, + "token_acc": 0.9458256432526327, + "train_speed(iter/s)": 0.356557 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.9734154343605042, + "learning_rate": 4.666036831274392e-05, + "loss": 0.2907134771347046, + "memory(GiB)": 132.93, + "step": 540, + "token_acc": 0.8969603297269448, + "train_speed(iter/s)": 0.356129 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.33516690135002136, + "eval_runtime": 1.2565, + "eval_samples_per_second": 3.183, + "eval_steps_per_second": 3.183, + "eval_token_acc": 0.7716535433070866, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.566674530506134, + "learning_rate": 4.582720926567552e-05, + "loss": 0.2214029312133789, + "memory(GiB)": 132.93, + "step": 545, + "token_acc": 0.8890953431657183, + "train_speed(iter/s)": 0.355557 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.42045527696609497, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.17021151781082153, + "memory(GiB)": 132.93, + "step": 550, + "token_acc": 0.9310242307120559, + "train_speed(iter/s)": 0.355665 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.6601650714874268, + "learning_rate": 
4.416461909609119e-05, + "loss": 0.18070143461227417, + "memory(GiB)": 132.93, + "step": 555, + "token_acc": 0.939800327819997, + "train_speed(iter/s)": 0.356126 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.25845786929130554, + "learning_rate": 4.333565223131107e-05, + "loss": 0.15745289325714112, + "memory(GiB)": 132.93, + "step": 560, + "token_acc": 0.9293805736322005, + "train_speed(iter/s)": 0.356108 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.33242106437683105, + "eval_runtime": 1.2648, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7696850393700787, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.7115989923477173, + "learning_rate": 4.250854630319593e-05, + "loss": 0.20625925064086914, + "memory(GiB)": 132.93, + "step": 565, + "token_acc": 0.9046810317376075, + "train_speed(iter/s)": 0.355539 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.6034452319145203, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.25417945384979246, + "memory(GiB)": 132.93, + "step": 570, + "token_acc": 0.9046015712682379, + "train_speed(iter/s)": 0.355222 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.9026182889938354, + "learning_rate": 4.0860840509215496e-05, + "loss": 0.1719497799873352, + "memory(GiB)": 132.93, + "step": 575, + "token_acc": 0.9340673744920698, + "train_speed(iter/s)": 0.35529 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 2.0807785987854004, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.2509638786315918, + "memory(GiB)": 132.93, + "step": 580, + "token_acc": 0.8932173225232352, + "train_speed(iter/s)": 0.355904 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.3441176414489746, + "eval_runtime": 1.2647, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7696850393700787, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.8126916885375977, + "learning_rate": 
3.922334199150432e-05, + "loss": 0.21597733497619628, + "memory(GiB)": 132.93, + "step": 585, + "token_acc": 0.8890164561806353, + "train_speed(iter/s)": 0.355876 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.702083945274353, + "learning_rate": 3.840899248662358e-05, + "loss": 0.20148119926452637, + "memory(GiB)": 132.93, + "step": 590, + "token_acc": 0.9332627118644068, + "train_speed(iter/s)": 0.355335 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.25534525513648987, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.2256376028060913, + "memory(GiB)": 132.93, + "step": 595, + "token_acc": 0.932657200811359, + "train_speed(iter/s)": 0.354817 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.8379502296447754, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.13529281616210936, + "memory(GiB)": 132.93, + "step": 600, + "token_acc": 0.9571852479864349, + "train_speed(iter/s)": 0.355006 + }, + { + "epoch": 3.0303030303030303, + "eval_loss": 0.30427584052085876, + "eval_runtime": 1.2499, + "eval_samples_per_second": 3.2, + "eval_steps_per_second": 3.2, + "eval_token_acc": 0.7706692913385826, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.82066644641024e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/training_args.bin b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..3e5f1859e5e9c8a7e2ce5f39b8b402501b009c17 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff8ef96f1eb3882c67481e0c535acbbf7662496ccc01eebb173c3afaaa0ebe8 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/README.md b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases 
and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a60675872493fa836badef060116183aeb9785a8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": 
"megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "q_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a9d3439280a4265e1978ef90d4f62b4bc944c82 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c31194d7edebe27aba4c2c453d64dedf7764ce2df1b0302e1212665f33000 +size 536991984 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/additional_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..8f99135ca18d6f9dbe39c51b5857595c8ee2bce9 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/optimizer.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..65a8da5cf84c34bb98a4db7cdcc7756ad2bc2148 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:318a9f3f3405f612ac76008e0a254313b4b547d5e26e3abd682386123db2550e +size 1074499986 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/rng_state.pth b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b6bf9c61dc517091545d3696d2418e808ca7905 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0200903f09e4b1af1372eaee6d869dbb03c649df9ccb0c99650e964caacfdd +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f8882c16fb0abc091aaea5286781182c084d87d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b3a16451354ac84ec594942621c3011b01d575ac8a6b2fa4481b0291c904a7 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/trainer_state.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..266503ff7efab75b3d26fe9fd72ed88c4bcaf331 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/trainer_state.json @@ -0,0 +1,2473 @@ +{ + "best_metric": 0.30427584, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 990, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.15279237926006317, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.594012975692749, + "memory(GiB)": 71.9, + "step": 1, + "token_acc": 0.8394495412844036, + "train_speed(iter/s)": 0.211078 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.3137172758579254, + "learning_rate": 1e-05, + "loss": 0.7095059156417847, + "memory(GiB)": 81.34, + "step": 5, + "token_acc": 0.829162656400385, + "train_speed(iter/s)": 0.342487 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 
0.1773456335067749, + "learning_rate": 2e-05, + "loss": 0.7598193645477295, + "memory(GiB)": 87.85, + "step": 10, + "token_acc": 0.7958339958657974, + "train_speed(iter/s)": 0.368643 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.19639591872692108, + "learning_rate": 3e-05, + "loss": 0.7279319763183594, + "memory(GiB)": 98.3, + "step": 15, + "token_acc": 0.8007322175732218, + "train_speed(iter/s)": 0.370534 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.970768928527832, + "learning_rate": 4e-05, + "loss": 1.0443785667419434, + "memory(GiB)": 98.3, + "step": 20, + "token_acc": 0.8557748113755078, + "train_speed(iter/s)": 0.390742 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.6980665922164917, + "eval_runtime": 1.2677, + "eval_samples_per_second": 3.155, + "eval_steps_per_second": 3.155, + "eval_token_acc": 0.7106299212598425, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.24524027109146118, + "learning_rate": 5e-05, + "loss": 0.5366989612579346, + "memory(GiB)": 108.29, + "step": 25, + "token_acc": 0.8295410346168806, + "train_speed(iter/s)": 0.358309 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.46116694808006287, + "learning_rate": 6e-05, + "loss": 0.6157921314239502, + "memory(GiB)": 108.29, + "step": 30, + "token_acc": 0.8143241564893396, + "train_speed(iter/s)": 0.372063 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.15954609215259552, + "learning_rate": 7e-05, + "loss": 0.3712780952453613, + "memory(GiB)": 108.29, + "step": 35, + "token_acc": 0.8573959255978743, + "train_speed(iter/s)": 0.380317 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.11871866136789322, + "learning_rate": 8e-05, + "loss": 0.44252305030822753, + "memory(GiB)": 108.29, + "step": 40, + "token_acc": 0.8448576409064498, + "train_speed(iter/s)": 0.382931 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.6573391556739807, + "eval_runtime": 1.307, + "eval_samples_per_second": 3.06, + 
"eval_steps_per_second": 3.06, + "eval_token_acc": 0.7431102362204725, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.21376259624958038, + "learning_rate": 9e-05, + "loss": 0.45000429153442384, + "memory(GiB)": 108.29, + "step": 45, + "token_acc": 0.8373225152129817, + "train_speed(iter/s)": 0.369324 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.1434803009033203, + "learning_rate": 0.0001, + "loss": 0.5050764560699463, + "memory(GiB)": 118.38, + "step": 50, + "token_acc": 0.8796895213454075, + "train_speed(iter/s)": 0.359356 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.19765082001686096, + "learning_rate": 9.999301905929286e-05, + "loss": 0.45570597648620603, + "memory(GiB)": 118.38, + "step": 55, + "token_acc": 0.8511966701352758, + "train_speed(iter/s)": 0.35956 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.19557587802410126, + "learning_rate": 9.997207818651274e-05, + "loss": 0.3813853025436401, + "memory(GiB)": 118.38, + "step": 60, + "token_acc": 0.8593534125449019, + "train_speed(iter/s)": 0.367382 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.5886135101318359, + "eval_runtime": 1.2814, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 3.122, + "eval_token_acc": 0.7509842519685039, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.2849789559841156, + "learning_rate": 9.99371832291393e-05, + "loss": 0.550228500366211, + "memory(GiB)": 118.38, + "step": 65, + "token_acc": 0.826555830150528, + "train_speed(iter/s)": 0.36018 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.1136852502822876, + "learning_rate": 9.988834393115767e-05, + "loss": 0.41207499504089357, + "memory(GiB)": 118.38, + "step": 70, + "token_acc": 0.8746200607902735, + "train_speed(iter/s)": 0.357839 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.17141863703727722, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5810668468475342, + "memory(GiB)": 118.38, + "step": 75, 
+ "token_acc": 0.8287886733088621, + "train_speed(iter/s)": 0.364223 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.170815110206604, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6472435474395752, + "memory(GiB)": 118.38, + "step": 80, + "token_acc": 0.841726618705036, + "train_speed(iter/s)": 0.366011 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5714729428291321, + "eval_runtime": 1.2893, + "eval_samples_per_second": 3.103, + "eval_steps_per_second": 3.103, + "eval_token_acc": 0.7509842519685039, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.1489511877298355, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5299827575683593, + "memory(GiB)": 118.38, + "step": 85, + "token_acc": 0.8189440290052119, + "train_speed(iter/s)": 0.361053 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.33635759353637695, + "learning_rate": 9.9553874407739e-05, + "loss": 0.4386009693145752, + "memory(GiB)": 118.38, + "step": 90, + "token_acc": 0.8471794871794872, + "train_speed(iter/s)": 0.363532 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.13267917931079865, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3524549722671509, + "memory(GiB)": 118.38, + "step": 95, + "token_acc": 0.8693969284554875, + "train_speed(iter/s)": 0.362313 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1372370421886444, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4188431739807129, + "memory(GiB)": 132.92, + "step": 100, + "token_acc": 0.8678135405105438, + "train_speed(iter/s)": 0.35826 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.565014660358429, + "eval_runtime": 1.3087, + "eval_samples_per_second": 3.057, + "eval_steps_per_second": 3.057, + "eval_token_acc": 0.7588582677165354, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.18173210322856903, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5471820831298828, + "memory(GiB)": 132.92, + "step": 105, + "token_acc": 
0.8052507836990596, + "train_speed(iter/s)": 0.356931 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5932771563529968, + "learning_rate": 9.899808525182935e-05, + "loss": 0.45682454109191895, + "memory(GiB)": 132.92, + "step": 110, + "token_acc": 0.8191489361702128, + "train_speed(iter/s)": 0.362578 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.10078100115060806, + "learning_rate": 9.882482608435923e-05, + "loss": 0.44985551834106446, + "memory(GiB)": 132.92, + "step": 115, + "token_acc": 0.8674093690073966, + "train_speed(iter/s)": 0.357889 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.299698144197464, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5030841827392578, + "memory(GiB)": 132.92, + "step": 120, + "token_acc": 0.8129452223041022, + "train_speed(iter/s)": 0.357367 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.5346428155899048, + "eval_runtime": 1.2611, + "eval_samples_per_second": 3.172, + "eval_steps_per_second": 3.172, + "eval_token_acc": 0.7608267716535433, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.20031973719596863, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4732785701751709, + "memory(GiB)": 132.92, + "step": 125, + "token_acc": 0.8376591971626632, + "train_speed(iter/s)": 0.356721 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.21032185852527618, + "learning_rate": 9.822345875271883e-05, + "loss": 0.48586230278015136, + "memory(GiB)": 132.92, + "step": 130, + "token_acc": 0.8409764190069914, + "train_speed(iter/s)": 0.3569 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.10959025472402573, + "learning_rate": 9.799599295015154e-05, + "loss": 0.362222957611084, + "memory(GiB)": 132.92, + "step": 135, + "token_acc": 0.8709891275523733, + "train_speed(iter/s)": 0.356829 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.20188008248806, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5328386306762696, + "memory(GiB)": 132.92, + "step": 
140, + "token_acc": 0.8266873144921926, + "train_speed(iter/s)": 0.35701 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.49480319023132324, + "eval_runtime": 1.2596, + "eval_samples_per_second": 3.176, + "eval_steps_per_second": 3.176, + "eval_token_acc": 0.7519685039370079, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.21190020442008972, + "learning_rate": 9.750092174273521e-05, + "loss": 0.3609034061431885, + "memory(GiB)": 132.92, + "step": 145, + "token_acc": 0.856077862911576, + "train_speed(iter/s)": 0.353392 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.42249825596809387, + "learning_rate": 9.723345458039594e-05, + "loss": 0.37536747455596925, + "memory(GiB)": 132.92, + "step": 150, + "token_acc": 0.8812260536398467, + "train_speed(iter/s)": 0.354546 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.2383793443441391, + "learning_rate": 9.69527980602239e-05, + "loss": 0.41838736534118653, + "memory(GiB)": 132.92, + "step": 155, + "token_acc": 0.857667360176233, + "train_speed(iter/s)": 0.354581 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.20167972147464752, + "learning_rate": 9.665903055208014e-05, + "loss": 0.3394253969192505, + "memory(GiB)": 132.92, + "step": 160, + "token_acc": 0.8810381038103811, + "train_speed(iter/s)": 0.356298 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.48000994324684143, + "eval_runtime": 1.2638, + "eval_samples_per_second": 3.165, + "eval_steps_per_second": 3.165, + "eval_token_acc": 0.7549212598425197, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1416829228401184, + "learning_rate": 9.635223408690688e-05, + "loss": 0.41873645782470703, + "memory(GiB)": 132.92, + "step": 165, + "token_acc": 0.8491237317169588, + "train_speed(iter/s)": 0.354225 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.20265255868434906, + "learning_rate": 9.603249433382144e-05, + "loss": 0.4564688205718994, + "memory(GiB)": 132.92, + "step": 170, + 
"token_acc": 0.8519195612431444, + "train_speed(iter/s)": 0.35386 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.282372385263443, + "learning_rate": 9.569990057619414e-05, + "loss": 0.41248092651367185, + "memory(GiB)": 132.92, + "step": 175, + "token_acc": 0.8566410170625627, + "train_speed(iter/s)": 0.355591 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.1956530064344406, + "learning_rate": 9.535454568671704e-05, + "loss": 0.4143404006958008, + "memory(GiB)": 132.92, + "step": 180, + "token_acc": 0.8632127625967462, + "train_speed(iter/s)": 0.356508 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.4819040894508362, + "eval_runtime": 1.2641, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.7637795275590551, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.25014156103134155, + "learning_rate": 9.49965261014704e-05, + "loss": 0.4956723690032959, + "memory(GiB)": 132.92, + "step": 185, + "token_acc": 0.8175658720200752, + "train_speed(iter/s)": 0.355286 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 3.0596258640289307, + "learning_rate": 9.462594179299406e-05, + "loss": 0.5832932949066162, + "memory(GiB)": 132.92, + "step": 190, + "token_acc": 0.8024002232765839, + "train_speed(iter/s)": 0.357668 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.1552937924861908, + "learning_rate": 9.424289624237144e-05, + "loss": 0.559151029586792, + "memory(GiB)": 132.92, + "step": 195, + "token_acc": 0.8146739738284309, + "train_speed(iter/s)": 0.3574 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.15638215839862823, + "learning_rate": 9.384749641033359e-05, + "loss": 0.5025578498840332, + "memory(GiB)": 132.92, + "step": 200, + "token_acc": 0.8483137494277431, + "train_speed(iter/s)": 0.353969 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4734075665473938, + "eval_runtime": 1.2465, + "eval_samples_per_second": 3.209, + "eval_steps_per_second": 3.209, + 
"eval_token_acc": 0.7667322834645669, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.16425618529319763, + "learning_rate": 9.343985270739182e-05, + "loss": 0.45037288665771485, + "memory(GiB)": 132.92, + "step": 205, + "token_acc": 0.835399107585523, + "train_speed(iter/s)": 0.350091 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.21121171116828918, + "learning_rate": 9.302007896300698e-05, + "loss": 0.42234115600585936, + "memory(GiB)": 132.92, + "step": 210, + "token_acc": 0.8614671060661541, + "train_speed(iter/s)": 0.350171 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.18083210289478302, + "learning_rate": 9.25882923938038e-05, + "loss": 0.3796833992004395, + "memory(GiB)": 132.92, + "step": 215, + "token_acc": 0.8746039856923863, + "train_speed(iter/s)": 0.349555 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.30925488471984863, + "learning_rate": 9.214461357083985e-05, + "loss": 0.3580619812011719, + "memory(GiB)": 132.92, + "step": 220, + "token_acc": 0.8796054540179866, + "train_speed(iter/s)": 0.350637 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.4918195605278015, + "eval_runtime": 1.2455, + "eval_samples_per_second": 3.212, + "eval_steps_per_second": 3.212, + "eval_token_acc": 0.7627952755905512, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.36294564604759216, + "learning_rate": 9.168916638593736e-05, + "loss": 0.4854443550109863, + "memory(GiB)": 132.92, + "step": 225, + "token_acc": 0.8444093422091843, + "train_speed(iter/s)": 0.349163 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.2092873752117157, + "learning_rate": 9.122207801708802e-05, + "loss": 0.39215381145477296, + "memory(GiB)": 132.92, + "step": 230, + "token_acc": 0.8688397695020211, + "train_speed(iter/s)": 0.347175 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.24130743741989136, + "learning_rate": 9.074347889294016e-05, + "loss": 0.20322649478912352, + "memory(GiB)": 132.92, + 
"step": 235, + "token_acc": 0.9091831557584982, + "train_speed(iter/s)": 0.349366 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.4042667746543884, + "learning_rate": 9.025350265637815e-05, + "loss": 0.44439196586608887, + "memory(GiB)": 132.92, + "step": 240, + "token_acc": 0.8551265412070085, + "train_speed(iter/s)": 0.350626 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5139080882072449, + "eval_runtime": 1.2422, + "eval_samples_per_second": 3.22, + "eval_steps_per_second": 3.22, + "eval_token_acc": 0.7627952755905512, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.2574117183685303, + "learning_rate": 8.975228612720416e-05, + "loss": 0.29034128189086916, + "memory(GiB)": 132.92, + "step": 245, + "token_acc": 0.8568427855873324, + "train_speed(iter/s)": 0.350706 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.49813297390937805, + "learning_rate": 8.923996926393305e-05, + "loss": 0.4220092296600342, + "memory(GiB)": 132.92, + "step": 250, + "token_acc": 0.8524024422617468, + "train_speed(iter/s)": 0.352825 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.2515278458595276, + "learning_rate": 8.871669512471068e-05, + "loss": 0.37980899810791013, + "memory(GiB)": 132.92, + "step": 255, + "token_acc": 0.8598113725849281, + "train_speed(iter/s)": 0.351391 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.43190768361091614, + "learning_rate": 8.818260982736661e-05, + "loss": 0.38159129619598386, + "memory(GiB)": 132.92, + "step": 260, + "token_acc": 0.8543320676561961, + "train_speed(iter/s)": 0.352512 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4426303505897522, + "eval_runtime": 1.2477, + "eval_samples_per_second": 3.206, + "eval_steps_per_second": 3.206, + "eval_token_acc": 0.7706692913385826, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.12871959805488586, + "learning_rate": 8.763786250861256e-05, + "loss": 0.30707058906555174, + "memory(GiB)": 132.92, + "step": 
265, + "token_acc": 0.8950704812745016, + "train_speed(iter/s)": 0.348922 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.2897852659225464, + "learning_rate": 8.708260528239788e-05, + "loss": 0.2806516170501709, + "memory(GiB)": 132.93, + "step": 270, + "token_acc": 0.8953946242081835, + "train_speed(iter/s)": 0.350022 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.2160148024559021, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2911843299865723, + "memory(GiB)": 132.93, + "step": 275, + "token_acc": 0.8955285818030916, + "train_speed(iter/s)": 0.349742 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.5827478170394897, + "learning_rate": 8.594118419389647e-05, + "loss": 0.41789636611938474, + "memory(GiB)": 132.93, + "step": 280, + "token_acc": 0.8685470085470085, + "train_speed(iter/s)": 0.350778 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.42635607719421387, + "eval_runtime": 1.244, + "eval_samples_per_second": 3.215, + "eval_steps_per_second": 3.215, + "eval_token_acc": 0.7736220472440944, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.5320525765419006, + "learning_rate": 8.535533905932738e-05, + "loss": 0.23049118518829345, + "memory(GiB)": 132.93, + "step": 285, + "token_acc": 0.8896126157010628, + "train_speed(iter/s)": 0.350607 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.4473183751106262, + "learning_rate": 8.475962138373213e-05, + "loss": 0.36380805969238283, + "memory(GiB)": 132.93, + "step": 290, + "token_acc": 0.8619923216811477, + "train_speed(iter/s)": 0.352023 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.30866506695747375, + "learning_rate": 8.415419751390155e-05, + "loss": 0.395569372177124, + "memory(GiB)": 132.93, + "step": 295, + "token_acc": 0.8510589842860397, + "train_speed(iter/s)": 0.353538 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.6120028495788574, + "learning_rate": 8.353923650696118e-05, + "loss": 0.32523369789123535, + 
"memory(GiB)": 132.93, + "step": 300, + "token_acc": 0.8855692530819435, + "train_speed(iter/s)": 0.353789 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.45433884859085083, + "eval_runtime": 1.2577, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 3.18, + "eval_token_acc": 0.7775590551181102, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.3755347430706024, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3657505035400391, + "memory(GiB)": 132.93, + "step": 305, + "token_acc": 0.8681001582425766, + "train_speed(iter/s)": 0.351379 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.33566632866859436, + "learning_rate": 8.228139257794012e-05, + "loss": 0.30150370597839354, + "memory(GiB)": 132.93, + "step": 310, + "token_acc": 0.900846170535908, + "train_speed(iter/s)": 0.352623 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.8437044024467468, + "learning_rate": 8.163886089321493e-05, + "loss": 0.3159091234207153, + "memory(GiB)": 132.93, + "step": 315, + "token_acc": 0.8779527559055118, + "train_speed(iter/s)": 0.353918 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.3801237642765045, + "learning_rate": 8.098749444801224e-05, + "loss": 0.33113200664520265, + "memory(GiB)": 132.93, + "step": 320, + "token_acc": 0.9020088943413587, + "train_speed(iter/s)": 0.354333 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.43398547172546387, + "eval_runtime": 1.2796, + "eval_samples_per_second": 3.126, + "eval_steps_per_second": 3.126, + "eval_token_acc": 0.7726377952755905, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.37100571393966675, + "learning_rate": 8.032747512835337e-05, + "loss": 0.34933011531829833, + "memory(GiB)": 132.93, + "step": 325, + "token_acc": 0.8527973927213471, + "train_speed(iter/s)": 0.353766 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.580182671546936, + "learning_rate": 7.965898723646776e-05, + "loss": 0.37286627292633057, + 
"memory(GiB)": 132.93, + "step": 330, + "token_acc": 0.8843691926491843, + "train_speed(iter/s)": 0.354876 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.3128257989883423, + "learning_rate": 7.898221743932888e-05, + "loss": 0.3794433116912842, + "memory(GiB)": 132.93, + "step": 335, + "token_acc": 0.8727474355420016, + "train_speed(iter/s)": 0.354891 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.8151857852935791, + "learning_rate": 7.829735471652978e-05, + "loss": 0.262727689743042, + "memory(GiB)": 132.93, + "step": 340, + "token_acc": 0.9075043630017452, + "train_speed(iter/s)": 0.355958 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.37076544761657715, + "eval_runtime": 1.2578, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 3.18, + "eval_token_acc": 0.7755905511811023, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.32429400086402893, + "learning_rate": 7.760459030751284e-05, + "loss": 0.28121564388275144, + "memory(GiB)": 132.93, + "step": 345, + "token_acc": 0.8951330717845404, + "train_speed(iter/s)": 0.354629 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.0614266395568848, + "learning_rate": 7.690411765816864e-05, + "loss": 0.20163230895996093, + "memory(GiB)": 132.93, + "step": 350, + "token_acc": 0.920251572327044, + "train_speed(iter/s)": 0.355616 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.626061737537384, + "learning_rate": 7.619613236681843e-05, + "loss": 0.4713843822479248, + "memory(GiB)": 132.93, + "step": 355, + "token_acc": 0.8495345016429354, + "train_speed(iter/s)": 0.355891 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.2624683976173401, + "learning_rate": 7.548083212959588e-05, + "loss": 0.25896482467651366, + "memory(GiB)": 132.93, + "step": 360, + "token_acc": 0.8964471929186371, + "train_speed(iter/s)": 0.356199 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.3961385488510132, + "eval_runtime": 1.2624, + "eval_samples_per_second": 
3.168, + "eval_steps_per_second": 3.168, + "eval_token_acc": 0.7765748031496063, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.7176984548568726, + "learning_rate": 7.475841668524268e-05, + "loss": 0.38772385120391845, + "memory(GiB)": 132.93, + "step": 365, + "token_acc": 0.8543818727090969, + "train_speed(iter/s)": 0.356015 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.27225521206855774, + "learning_rate": 7.402908775933419e-05, + "loss": 0.34966278076171875, + "memory(GiB)": 132.93, + "step": 370, + "token_acc": 0.8757403751233959, + "train_speed(iter/s)": 0.356009 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.4194507300853729, + "learning_rate": 7.329304900794991e-05, + "loss": 0.4062873363494873, + "memory(GiB)": 132.93, + "step": 375, + "token_acc": 0.8658723605048956, + "train_speed(iter/s)": 0.355662 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.6045131683349609, + "learning_rate": 7.255050596080509e-05, + "loss": 0.352255654335022, + "memory(GiB)": 132.93, + "step": 380, + "token_acc": 0.8888125343595382, + "train_speed(iter/s)": 0.356078 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.38526344299316406, + "eval_runtime": 1.2874, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 3.107, + "eval_token_acc": 0.7726377952755905, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.16466927528381348, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3758531093597412, + "memory(GiB)": 132.93, + "step": 385, + "token_acc": 0.8816031376394166, + "train_speed(iter/s)": 0.354701 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.324630469083786, + "learning_rate": 7.104673812141675e-05, + "loss": 0.25691215991973876, + "memory(GiB)": 132.93, + "step": 390, + "token_acc": 0.8995055766356215, + "train_speed(iter/s)": 0.354307 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3074846863746643, + "learning_rate": 7.02859332377382e-05, + "loss": 
0.2570985794067383, + "memory(GiB)": 132.93, + "step": 395, + "token_acc": 0.899527983816588, + "train_speed(iter/s)": 0.35515 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.5683487057685852, + "learning_rate": 6.951946375817474e-05, + "loss": 0.22938873767852783, + "memory(GiB)": 132.93, + "step": 400, + "token_acc": 0.9419516786946972, + "train_speed(iter/s)": 0.356409 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3636195659637451, + "eval_runtime": 1.2559, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 3.185, + "eval_token_acc": 0.7677165354330708, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6048702001571655, + "learning_rate": 6.874754370984606e-05, + "loss": 0.15864256620407105, + "memory(GiB)": 132.93, + "step": 405, + "token_acc": 0.9092146454335103, + "train_speed(iter/s)": 0.355742 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.7089707851409912, + "learning_rate": 6.797038864187564e-05, + "loss": 0.3233179092407227, + "memory(GiB)": 132.93, + "step": 410, + "token_acc": 0.9176615891313298, + "train_speed(iter/s)": 0.355669 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.370557576417923, + "learning_rate": 6.718821556520151e-05, + "loss": 0.19509116411209107, + "memory(GiB)": 132.93, + "step": 415, + "token_acc": 0.9241970021413276, + "train_speed(iter/s)": 0.356398 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.2815419137477875, + "learning_rate": 6.640124289197845e-05, + "loss": 0.09936256408691406, + "memory(GiB)": 132.93, + "step": 420, + "token_acc": 0.9714867617107943, + "train_speed(iter/s)": 0.35724 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.37009483575820923, + "eval_runtime": 1.2502, + "eval_samples_per_second": 3.199, + "eval_steps_per_second": 3.199, + "eval_token_acc": 0.7736220472440944, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.6189222931861877, + "learning_rate": 6.560969037458933e-05, + "loss": 
0.18864725828170775, + "memory(GiB)": 132.93, + "step": 425, + "token_acc": 0.8931275480489226, + "train_speed(iter/s)": 0.357016 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.4722209870815277, + "learning_rate": 6.481377904428171e-05, + "loss": 0.13603065013885499, + "memory(GiB)": 132.93, + "step": 430, + "token_acc": 0.9594075079149706, + "train_speed(iter/s)": 0.356484 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.25799301266670227, + "learning_rate": 6.401373114944781e-05, + "loss": 0.1884603261947632, + "memory(GiB)": 132.93, + "step": 435, + "token_acc": 0.9427539503386004, + "train_speed(iter/s)": 0.355717 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.847876787185669, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2138120174407959, + "memory(GiB)": 132.93, + "step": 440, + "token_acc": 0.9274074074074075, + "train_speed(iter/s)": 0.356335 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.378639817237854, + "eval_runtime": 1.236, + "eval_samples_per_second": 3.236, + "eval_steps_per_second": 3.236, + "eval_token_acc": 0.7736220472440944, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.38243409991264343, + "learning_rate": 6.240212037280966e-05, + "loss": 0.12016980648040772, + "memory(GiB)": 132.93, + "step": 445, + "token_acc": 0.9293953606287235, + "train_speed(iter/s)": 0.355702 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.717224657535553, + "learning_rate": 6.159100751337642e-05, + "loss": 0.2691728830337524, + "memory(GiB)": 132.93, + "step": 450, + "token_acc": 0.905337548819044, + "train_speed(iter/s)": 0.356304 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.24402710795402527, + "learning_rate": 6.077665800849568e-05, + "loss": 0.18497172594070435, + "memory(GiB)": 132.93, + "step": 455, + "token_acc": 0.9337628865979382, + "train_speed(iter/s)": 0.356536 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.246830552816391, + "learning_rate": 
5.99592992551918e-05, + "loss": 0.19234393835067748, + "memory(GiB)": 132.93, + "step": 460, + "token_acc": 0.9360821581851625, + "train_speed(iter/s)": 0.356885 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3614441454410553, + "eval_runtime": 1.2634, + "eval_samples_per_second": 3.166, + "eval_steps_per_second": 3.166, + "eval_token_acc": 0.7716535433070866, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.48954588174819946, + "learning_rate": 5.913915949078452e-05, + "loss": 0.17323193550109864, + "memory(GiB)": 132.93, + "step": 465, + "token_acc": 0.8919093851132686, + "train_speed(iter/s)": 0.357128 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.5584018230438232, + "learning_rate": 5.831646772915651e-05, + "loss": 0.13749444484710693, + "memory(GiB)": 132.93, + "step": 470, + "token_acc": 0.9434219495569189, + "train_speed(iter/s)": 0.35673 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 1.301836371421814, + "learning_rate": 5.749145369680407e-05, + "loss": 0.20462331771850586, + "memory(GiB)": 132.93, + "step": 475, + "token_acc": 0.9227237949502678, + "train_speed(iter/s)": 0.357258 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.33143967390060425, + "learning_rate": 5.666434776868895e-05, + "loss": 0.20529029369354249, + "memory(GiB)": 132.93, + "step": 480, + "token_acc": 0.9261273320505312, + "train_speed(iter/s)": 0.356138 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.3671746850013733, + "eval_runtime": 1.2559, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 3.185, + "eval_token_acc": 0.7618110236220472, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.6394232511520386, + "learning_rate": 5.583538090390882e-05, + "loss": 0.148415470123291, + "memory(GiB)": 132.93, + "step": 485, + "token_acc": 0.8970821081203347, + "train_speed(iter/s)": 0.356392 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5502394437789917, + "learning_rate": 
5.5004784581204927e-05, + "loss": 0.2857876539230347, + "memory(GiB)": 132.93, + "step": 490, + "token_acc": 0.895648670427075, + "train_speed(iter/s)": 0.356615 + }, + { + "epoch": 2.5, + "grad_norm": 0.2799193263053894, + "learning_rate": 5.41727907343245e-05, + "loss": 0.15979899168014527, + "memory(GiB)": 132.93, + "step": 495, + "token_acc": 0.9382183908045977, + "train_speed(iter/s)": 0.357352 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.47879472374916077, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.23016483783721925, + "memory(GiB)": 132.93, + "step": 500, + "token_acc": 0.8928835262250677, + "train_speed(iter/s)": 0.357166 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.36765411496162415, + "eval_runtime": 1.2648, + "eval_samples_per_second": 3.162, + "eval_steps_per_second": 3.162, + "eval_token_acc": 0.7706692913385826, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.5626565217971802, + "learning_rate": 5.250554008935596e-05, + "loss": 0.15926196575164794, + "memory(GiB)": 132.93, + "step": 505, + "token_acc": 0.90470706779905, + "train_speed(iter/s)": 0.356766 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.7293064594268799, + "learning_rate": 5.167074885038373e-05, + "loss": 0.15416876077651978, + "memory(GiB)": 132.93, + "step": 510, + "token_acc": 0.9385658067337123, + "train_speed(iter/s)": 0.357626 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.32495784759521484, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.18747940063476562, + "memory(GiB)": 132.93, + "step": 515, + "token_acc": 0.9271042471042471, + "train_speed(iter/s)": 0.358064 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.372670978307724, + "learning_rate": 5e-05, + "loss": 0.19285820722579955, + "memory(GiB)": 132.93, + "step": 520, + "token_acc": 0.9182068423122296, + "train_speed(iter/s)": 0.358116 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.35743266344070435, + "eval_runtime": 1.2997, + 
"eval_samples_per_second": 3.078, + "eval_steps_per_second": 3.078, + "eval_token_acc": 0.7687007874015748, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.4129459857940674, + "learning_rate": 4.916450892453495e-05, + "loss": 0.1654897451400757, + "memory(GiB)": 132.93, + "step": 525, + "token_acc": 0.9160741885625966, + "train_speed(iter/s)": 0.357945 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3547585904598236, + "learning_rate": 4.832925114961629e-05, + "loss": 0.22736096382141113, + "memory(GiB)": 132.93, + "step": 530, + "token_acc": 0.9109231599784056, + "train_speed(iter/s)": 0.357177 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7628602385520935, + "learning_rate": 4.749445991064404e-05, + "loss": 0.16493122577667235, + "memory(GiB)": 132.93, + "step": 535, + "token_acc": 0.9458256432526327, + "train_speed(iter/s)": 0.356557 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.9734154343605042, + "learning_rate": 4.666036831274392e-05, + "loss": 0.2907134771347046, + "memory(GiB)": 132.93, + "step": 540, + "token_acc": 0.8969603297269448, + "train_speed(iter/s)": 0.356129 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.33516690135002136, + "eval_runtime": 1.2565, + "eval_samples_per_second": 3.183, + "eval_steps_per_second": 3.183, + "eval_token_acc": 0.7716535433070866, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.566674530506134, + "learning_rate": 4.582720926567552e-05, + "loss": 0.2214029312133789, + "memory(GiB)": 132.93, + "step": 545, + "token_acc": 0.8890953431657183, + "train_speed(iter/s)": 0.355557 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.42045527696609497, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.17021151781082153, + "memory(GiB)": 132.93, + "step": 550, + "token_acc": 0.9310242307120559, + "train_speed(iter/s)": 0.355665 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.6601650714874268, + "learning_rate": 
4.416461909609119e-05, + "loss": 0.18070143461227417, + "memory(GiB)": 132.93, + "step": 555, + "token_acc": 0.939800327819997, + "train_speed(iter/s)": 0.356126 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.25845786929130554, + "learning_rate": 4.333565223131107e-05, + "loss": 0.15745289325714112, + "memory(GiB)": 132.93, + "step": 560, + "token_acc": 0.9293805736322005, + "train_speed(iter/s)": 0.356108 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.33242106437683105, + "eval_runtime": 1.2648, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7696850393700787, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.7115989923477173, + "learning_rate": 4.250854630319593e-05, + "loss": 0.20625925064086914, + "memory(GiB)": 132.93, + "step": 565, + "token_acc": 0.9046810317376075, + "train_speed(iter/s)": 0.355539 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.6034452319145203, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.25417945384979246, + "memory(GiB)": 132.93, + "step": 570, + "token_acc": 0.9046015712682379, + "train_speed(iter/s)": 0.355222 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.9026182889938354, + "learning_rate": 4.0860840509215496e-05, + "loss": 0.1719497799873352, + "memory(GiB)": 132.93, + "step": 575, + "token_acc": 0.9340673744920698, + "train_speed(iter/s)": 0.35529 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 2.0807785987854004, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.2509638786315918, + "memory(GiB)": 132.93, + "step": 580, + "token_acc": 0.8932173225232352, + "train_speed(iter/s)": 0.355904 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.3441176414489746, + "eval_runtime": 1.2647, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7696850393700787, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.8126916885375977, + "learning_rate": 
3.922334199150432e-05, + "loss": 0.21597733497619628, + "memory(GiB)": 132.93, + "step": 585, + "token_acc": 0.8890164561806353, + "train_speed(iter/s)": 0.355876 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.702083945274353, + "learning_rate": 3.840899248662358e-05, + "loss": 0.20148119926452637, + "memory(GiB)": 132.93, + "step": 590, + "token_acc": 0.9332627118644068, + "train_speed(iter/s)": 0.355335 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.25534525513648987, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.2256376028060913, + "memory(GiB)": 132.93, + "step": 595, + "token_acc": 0.932657200811359, + "train_speed(iter/s)": 0.354817 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.8379502296447754, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.13529281616210936, + "memory(GiB)": 132.93, + "step": 600, + "token_acc": 0.9571852479864349, + "train_speed(iter/s)": 0.355006 + }, + { + "epoch": 3.0303030303030303, + "eval_loss": 0.30427584052085876, + "eval_runtime": 1.2499, + "eval_samples_per_second": 3.2, + "eval_steps_per_second": 3.2, + "eval_token_acc": 0.7706692913385826, + "step": 600 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.5740740895271301, + "learning_rate": 3.598626885055219e-05, + "loss": 0.09742944836616516, + "memory(GiB)": 132.93, + "step": 605, + "token_acc": 0.9399038461538461, + "train_speed(iter/s)": 0.353865 + }, + { + "epoch": 3.080808080808081, + "grad_norm": 0.7276327610015869, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.05046466588973999, + "memory(GiB)": 132.93, + "step": 610, + "token_acc": 0.9742566354021154, + "train_speed(iter/s)": 0.354532 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.1619880646467209, + "learning_rate": 3.4390309625410686e-05, + "loss": 0.03918294310569763, + "memory(GiB)": 132.93, + "step": 615, + "token_acc": 0.9892593421347058, + "train_speed(iter/s)": 0.355134 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 
0.4289223849773407, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.08534240126609802, + "memory(GiB)": 132.93, + "step": 620, + "token_acc": 0.9704122340425532, + "train_speed(iter/s)": 0.355115 + }, + { + "epoch": 3.1313131313131315, + "eval_loss": 0.3079659342765808, + "eval_runtime": 1.2475, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 3.207, + "eval_token_acc": 0.765748031496063, + "step": 620 + }, + { + "epoch": 3.1565656565656566, + "grad_norm": 0.4063633680343628, + "learning_rate": 3.281178443479852e-05, + "loss": 0.09771488904953003, + "memory(GiB)": 132.93, + "step": 625, + "token_acc": 0.9483132767804301, + "train_speed(iter/s)": 0.354176 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.7491307258605957, + "learning_rate": 3.202961135812437e-05, + "loss": 0.0691908597946167, + "memory(GiB)": 132.93, + "step": 630, + "token_acc": 0.9681883908283413, + "train_speed(iter/s)": 0.354714 + }, + { + "epoch": 3.207070707070707, + "grad_norm": 0.5784112215042114, + "learning_rate": 3.1252456290153954e-05, + "loss": 0.1324032187461853, + "memory(GiB)": 132.93, + "step": 635, + "token_acc": 0.9444506165981558, + "train_speed(iter/s)": 0.354589 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.4406881630420685, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.11127980947494506, + "memory(GiB)": 132.93, + "step": 640, + "token_acc": 0.9494873483209482, + "train_speed(iter/s)": 0.353946 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.30791735649108887, + "eval_runtime": 1.2574, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 3.181, + "eval_token_acc": 0.7716535433070866, + "step": 640 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.4303297996520996, + "learning_rate": 2.9714066762261823e-05, + "loss": 0.08342494964599609, + "memory(GiB)": 132.93, + "step": 645, + "token_acc": 0.9452848128619586, + "train_speed(iter/s)": 0.352926 + }, + { + "epoch": 3.282828282828283, + "grad_norm": 
0.2603033781051636, + "learning_rate": 2.895326187858326e-05, + "loss": 0.1292866587638855, + "memory(GiB)": 132.93, + "step": 650, + "token_acc": 0.9530223943424819, + "train_speed(iter/s)": 0.353396 + }, + { + "epoch": 3.308080808080808, + "grad_norm": 0.4966074824333191, + "learning_rate": 2.8198334036140874e-05, + "loss": 0.09613170623779296, + "memory(GiB)": 132.93, + "step": 655, + "token_acc": 0.9691442468460252, + "train_speed(iter/s)": 0.353561 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.2868223190307617, + "learning_rate": 2.74494940391949e-05, + "loss": 0.09578182101249695, + "memory(GiB)": 132.93, + "step": 660, + "token_acc": 0.9634032634032634, + "train_speed(iter/s)": 0.353616 + }, + { + "epoch": 3.3333333333333335, + "eval_loss": 0.30913257598876953, + "eval_runtime": 1.251, + "eval_samples_per_second": 3.197, + "eval_steps_per_second": 3.197, + "eval_token_acc": 0.7706692913385826, + "step": 660 + }, + { + "epoch": 3.3585858585858586, + "grad_norm": 0.8172745108604431, + "learning_rate": 2.6706950992050094e-05, + "loss": 0.09090739488601685, + "memory(GiB)": 132.93, + "step": 665, + "token_acc": 0.9261056167195372, + "train_speed(iter/s)": 0.353647 + }, + { + "epoch": 3.3838383838383836, + "grad_norm": 0.8159921169281006, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.10645673274993897, + "memory(GiB)": 132.93, + "step": 670, + "token_acc": 0.9591964846202135, + "train_speed(iter/s)": 0.353871 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.35854482650756836, + "learning_rate": 2.5241583314757327e-05, + "loss": 0.07736409306526185, + "memory(GiB)": 132.93, + "step": 675, + "token_acc": 0.9623198471956937, + "train_speed(iter/s)": 0.354188 + }, + { + "epoch": 3.4343434343434343, + "grad_norm": 0.6301116943359375, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.12336930036544799, + "memory(GiB)": 132.93, + "step": 680, + "token_acc": 0.9413841807909604, + "train_speed(iter/s)": 0.354564 + }, + { + "epoch": 
3.4343434343434343, + "eval_loss": 0.3097744286060333, + "eval_runtime": 1.3015, + "eval_samples_per_second": 3.073, + "eval_steps_per_second": 3.073, + "eval_token_acc": 0.764763779527559, + "step": 680 + }, + { + "epoch": 3.45959595959596, + "grad_norm": 0.2789129614830017, + "learning_rate": 2.3803867633181574e-05, + "loss": 0.11545860767364502, + "memory(GiB)": 132.93, + "step": 685, + "token_acc": 0.9139716761783978, + "train_speed(iter/s)": 0.353786 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.7183085083961487, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.1217241644859314, + "memory(GiB)": 132.93, + "step": 690, + "token_acc": 0.9524612272420768, + "train_speed(iter/s)": 0.353548 + }, + { + "epoch": 3.51010101010101, + "grad_norm": 0.3908962905406952, + "learning_rate": 2.2395409692487175e-05, + "loss": 0.11445937156677247, + "memory(GiB)": 132.93, + "step": 695, + "token_acc": 0.9617232295056563, + "train_speed(iter/s)": 0.353948 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.22522900998592377, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.0395620197057724, + "memory(GiB)": 132.93, + "step": 700, + "token_acc": 0.9872527472527473, + "train_speed(iter/s)": 0.354517 + }, + { + "epoch": 3.5353535353535355, + "eval_loss": 0.30946803092956543, + "eval_runtime": 1.3167, + "eval_samples_per_second": 3.038, + "eval_steps_per_second": 3.038, + "eval_token_acc": 0.7677165354330708, + "step": 700 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.40321338176727295, + "learning_rate": 2.1017782560671123e-05, + "loss": 0.07134841680526734, + "memory(GiB)": 132.93, + "step": 705, + "token_acc": 0.9328724758959432, + "train_speed(iter/s)": 0.354512 + }, + { + "epoch": 3.5858585858585856, + "grad_norm": 0.5617113709449768, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.09319761395454407, + "memory(GiB)": 132.93, + "step": 710, + "token_acc": 0.9647415777359281, + "train_speed(iter/s)": 0.354354 + }, + { + "epoch": 
3.611111111111111, + "grad_norm": 0.036976154893636703, + "learning_rate": 1.967252487164663e-05, + "loss": 0.0075861550867557526, + "memory(GiB)": 132.93, + "step": 715, + "token_acc": 0.9970123772940674, + "train_speed(iter/s)": 0.35527 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.630945086479187, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.0571125328540802, + "memory(GiB)": 132.93, + "step": 720, + "token_acc": 0.9803678212794765, + "train_speed(iter/s)": 0.355088 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.3143712282180786, + "eval_runtime": 1.27, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 3.15, + "eval_token_acc": 0.764763779527559, + "step": 720 + }, + { + "epoch": 3.6616161616161618, + "grad_norm": 0.5826022624969482, + "learning_rate": 1.836113910678507e-05, + "loss": 0.10505948066711426, + "memory(GiB)": 132.93, + "step": 725, + "token_acc": 0.9304705882352942, + "train_speed(iter/s)": 0.354673 + }, + { + "epoch": 3.686868686868687, + "grad_norm": 0.31852394342422485, + "learning_rate": 1.771860742205988e-05, + "loss": 0.11193917989730835, + "memory(GiB)": 132.93, + "step": 730, + "token_acc": 0.9555294348124204, + "train_speed(iter/s)": 0.354365 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.5509043335914612, + "learning_rate": 1.7085089916835923e-05, + "loss": 0.0963442325592041, + "memory(GiB)": 132.93, + "step": 735, + "token_acc": 0.9589075419694312, + "train_speed(iter/s)": 0.354253 + }, + { + "epoch": 3.7373737373737375, + "grad_norm": 0.714759349822998, + "learning_rate": 1.646076349303884e-05, + "loss": 0.12444987297058105, + "memory(GiB)": 132.93, + "step": 740, + "token_acc": 0.9618570602966673, + "train_speed(iter/s)": 0.354721 + }, + { + "epoch": 3.7373737373737375, + "eval_loss": 0.3131199777126312, + "eval_runtime": 1.2723, + "eval_samples_per_second": 3.144, + "eval_steps_per_second": 3.144, + "eval_token_acc": 0.7667322834645669, + "step": 740 + }, + { + "epoch": 
3.7626262626262625, + "grad_norm": 0.47293511033058167, + "learning_rate": 1.584580248609846e-05, + "loss": 0.06069818139076233, + "memory(GiB)": 132.93, + "step": 745, + "token_acc": 0.9129849137931034, + "train_speed(iter/s)": 0.354907 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.7617067098617554, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.042090201377868654, + "memory(GiB)": 132.94, + "step": 750, + "token_acc": 0.9831313851271086, + "train_speed(iter/s)": 0.355507 + }, + { + "epoch": 3.813131313131313, + "grad_norm": 0.46223777532577515, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.18900134563446044, + "memory(GiB)": 132.94, + "step": 755, + "token_acc": 0.9208525754884547, + "train_speed(iter/s)": 0.354408 + }, + { + "epoch": 3.8383838383838382, + "grad_norm": 0.48695704340934753, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.01981939971446991, + "memory(GiB)": 132.94, + "step": 760, + "token_acc": 0.9939024390243902, + "train_speed(iter/s)": 0.355135 + }, + { + "epoch": 3.8383838383838382, + "eval_loss": 0.3126079738140106, + "eval_runtime": 1.248, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 3.205, + "eval_token_acc": 0.765748031496063, + "step": 760 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.6136153936386108, + "learning_rate": 1.3483006802566544e-05, + "loss": 0.1109347939491272, + "memory(GiB)": 132.94, + "step": 765, + "token_acc": 0.9204441740188621, + "train_speed(iter/s)": 0.354973 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.24277837574481964, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.06602246761322021, + "memory(GiB)": 132.94, + "step": 770, + "token_acc": 0.97854340362923, + "train_speed(iter/s)": 0.355116 + }, + { + "epoch": 3.9141414141414144, + "grad_norm": 0.574522852897644, + "learning_rate": 1.2362137491387432e-05, + "loss": 0.1160237193107605, + "memory(GiB)": 132.94, + "step": 775, + "token_acc": 0.9735529696236965, + "train_speed(iter/s)": 
0.35515 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.409150630235672, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.09153335094451905, + "memory(GiB)": 132.94, + "step": 780, + "token_acc": 0.9558498896247241, + "train_speed(iter/s)": 0.355102 + }, + { + "epoch": 3.9393939393939394, + "eval_loss": 0.3179924190044403, + "eval_runtime": 1.2772, + "eval_samples_per_second": 3.132, + "eval_steps_per_second": 3.132, + "eval_token_acc": 0.765748031496063, + "step": 780 + }, + { + "epoch": 3.9646464646464645, + "grad_norm": 1.5299787521362305, + "learning_rate": 1.1283304875289336e-05, + "loss": 0.07547231316566468, + "memory(GiB)": 132.94, + "step": 785, + "token_acc": 0.9211635750421585, + "train_speed(iter/s)": 0.355 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 0.803305983543396, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.1015932559967041, + "memory(GiB)": 132.94, + "step": 790, + "token_acc": 0.9662865642042637, + "train_speed(iter/s)": 0.35536 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.4793414771556854, + "learning_rate": 1.024771387279585e-05, + "loss": 0.08443622589111328, + "memory(GiB)": 132.94, + "step": 795, + "token_acc": 0.9721483335941167, + "train_speed(iter/s)": 0.355607 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 0.348900705575943, + "learning_rate": 9.746497343621857e-06, + "loss": 0.035999318957328795, + "memory(GiB)": 132.94, + "step": 800, + "token_acc": 0.9878715180594428, + "train_speed(iter/s)": 0.355646 + }, + { + "epoch": 4.040404040404041, + "eval_loss": 0.3235396444797516, + "eval_runtime": 1.2641, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.764763779527559, + "step": 800 + }, + { + "epoch": 4.065656565656566, + "grad_norm": 0.6782447099685669, + "learning_rate": 9.256521107059834e-06, + "loss": 0.06672016382217408, + "memory(GiB)": 132.94, + "step": 805, + "token_acc": 0.9496715011776373, + "train_speed(iter/s)": 0.355197 + }, + { + 
"epoch": 4.090909090909091, + "grad_norm": 0.02989630214869976, + "learning_rate": 8.777921982911996e-06, + "loss": 0.01867678463459015, + "memory(GiB)": 132.94, + "step": 810, + "token_acc": 0.9937345737611544, + "train_speed(iter/s)": 0.355631 + }, + { + "epoch": 4.116161616161616, + "grad_norm": 0.47457507252693176, + "learning_rate": 8.310833614062651e-06, + "loss": 0.046671625971794126, + "memory(GiB)": 132.94, + "step": 815, + "token_acc": 0.9823114975266077, + "train_speed(iter/s)": 0.355942 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.23747724294662476, + "learning_rate": 7.85538642916015e-06, + "loss": 0.03511995375156403, + "memory(GiB)": 132.94, + "step": 820, + "token_acc": 0.9881111606479418, + "train_speed(iter/s)": 0.355854 + }, + { + "epoch": 4.141414141414141, + "eval_loss": 0.3295816481113434, + "eval_runtime": 1.2496, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 3.201, + "eval_token_acc": 0.7608267716535433, + "step": 820 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.46623966097831726, + "learning_rate": 7.4117076061961885e-06, + "loss": 0.0608062744140625, + "memory(GiB)": 132.94, + "step": 825, + "token_acc": 0.9560709705002137, + "train_speed(iter/s)": 0.355301 + }, + { + "epoch": 4.191919191919192, + "grad_norm": 0.3173864483833313, + "learning_rate": 6.979921036993042e-06, + "loss": 0.07521570324897767, + "memory(GiB)": 132.94, + "step": 830, + "token_acc": 0.9431046931407943, + "train_speed(iter/s)": 0.355358 + }, + { + "epoch": 4.217171717171717, + "grad_norm": 0.6492113471031189, + "learning_rate": 6.5601472926081766e-06, + "loss": 0.10097864866256714, + "memory(GiB)": 132.94, + "step": 835, + "token_acc": 0.9563927235195459, + "train_speed(iter/s)": 0.355449 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.45515960454940796, + "learning_rate": 6.152503589666425e-06, + "loss": 0.07692533135414123, + "memory(GiB)": 132.94, + "step": 840, + "token_acc": 0.9672113638790986, + 
"train_speed(iter/s)": 0.355338 + }, + { + "epoch": 4.242424242424242, + "eval_loss": 0.3356034457683563, + "eval_runtime": 1.2455, + "eval_samples_per_second": 3.211, + "eval_steps_per_second": 3.211, + "eval_token_acc": 0.7627952755905512, + "step": 840 + }, + { + "epoch": 4.267676767676767, + "grad_norm": 0.8457357287406921, + "learning_rate": 5.757103757628573e-06, + "loss": 0.14583762884140014, + "memory(GiB)": 132.94, + "step": 845, + "token_acc": 0.9234553884000378, + "train_speed(iter/s)": 0.354526 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 0.2833046019077301, + "learning_rate": 5.374058207005944e-06, + "loss": 0.10489171743392944, + "memory(GiB)": 132.94, + "step": 850, + "token_acc": 0.9607229402261712, + "train_speed(iter/s)": 0.354312 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.39615917205810547, + "learning_rate": 5.0034738985296095e-06, + "loss": 0.03666624426841736, + "memory(GiB)": 132.94, + "step": 855, + "token_acc": 0.9860793909733551, + "train_speed(iter/s)": 0.354264 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 0.20940014719963074, + "learning_rate": 4.645454313282965e-06, + "loss": 0.03459435701370239, + "memory(GiB)": 132.94, + "step": 860, + "token_acc": 0.9874883535205644, + "train_speed(iter/s)": 0.354279 + }, + { + "epoch": 4.343434343434343, + "eval_loss": 0.3347886800765991, + "eval_runtime": 1.2407, + "eval_samples_per_second": 3.224, + "eval_steps_per_second": 3.224, + "eval_token_acc": 0.7608267716535433, + "step": 860 + }, + { + "epoch": 4.3686868686868685, + "grad_norm": 0.45534172654151917, + "learning_rate": 4.3000994238058644e-06, + "loss": 0.042598605155944824, + "memory(GiB)": 132.94, + "step": 865, + "token_acc": 0.9554907934825805, + "train_speed(iter/s)": 0.354035 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.6204268336296082, + "learning_rate": 3.967505666178556e-06, + "loss": 0.04259795844554901, + "memory(GiB)": 132.94, + "step": 870, + "token_acc": 0.9884420519316023, + 
"train_speed(iter/s)": 0.354313 + }, + { + "epoch": 4.41919191919192, + "grad_norm": 0.40318140387535095, + "learning_rate": 3.647765913093132e-06, + "loss": 0.024170319736003875, + "memory(GiB)": 132.94, + "step": 875, + "token_acc": 0.9857792946530148, + "train_speed(iter/s)": 0.354708 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.6216991543769836, + "learning_rate": 3.340969447919873e-06, + "loss": 0.05446074604988098, + "memory(GiB)": 132.94, + "step": 880, + "token_acc": 0.9749515431638587, + "train_speed(iter/s)": 0.354846 + }, + { + "epoch": 4.444444444444445, + "eval_loss": 0.3343099355697632, + "eval_runtime": 1.2375, + "eval_samples_per_second": 3.232, + "eval_steps_per_second": 3.232, + "eval_token_acc": 0.7608267716535433, + "step": 880 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.325848788022995, + "learning_rate": 3.0472019397761064e-06, + "loss": 0.03617172241210938, + "memory(GiB)": 132.94, + "step": 885, + "token_acc": 0.9599399098647972, + "train_speed(iter/s)": 0.354631 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 0.3116080164909363, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.05567449927330017, + "memory(GiB)": 132.94, + "step": 890, + "token_acc": 0.9825274278748476, + "train_speed(iter/s)": 0.354558 + }, + { + "epoch": 4.52020202020202, + "grad_norm": 0.4584411084651947, + "learning_rate": 2.4990782572647975e-06, + "loss": 0.025903385877609254, + "memory(GiB)": 132.94, + "step": 895, + "token_acc": 0.9856645504812616, + "train_speed(iter/s)": 0.354943 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.6048192381858826, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.11551048755645751, + "memory(GiB)": 132.94, + "step": 900, + "token_acc": 0.9571796522858983, + "train_speed(iter/s)": 0.354798 + }, + { + "epoch": 4.545454545454545, + "eval_loss": 0.33555707335472107, + "eval_runtime": 1.2587, + "eval_samples_per_second": 3.178, + "eval_steps_per_second": 3.178, + "eval_token_acc": 
0.7618110236220472, + "step": 900 + }, + { + "epoch": 4.570707070707071, + "grad_norm": 0.09098433703184128, + "learning_rate": 2.004007049848461e-06, + "loss": 0.009146060794591904, + "memory(GiB)": 132.94, + "step": 905, + "token_acc": 0.960080770425598, + "train_speed(iter/s)": 0.354755 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 0.12259010970592499, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.01627262681722641, + "memory(GiB)": 132.94, + "step": 910, + "token_acc": 0.995425667090216, + "train_speed(iter/s)": 0.355345 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.03434799984097481, + "learning_rate": 1.5625412489637337e-06, + "loss": 0.07969475984573364, + "memory(GiB)": 132.94, + "step": 915, + "token_acc": 0.9726568433844751, + "train_speed(iter/s)": 0.355547 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 0.23276808857917786, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.02643486261367798, + "memory(GiB)": 132.94, + "step": 920, + "token_acc": 0.9886294416243655, + "train_speed(iter/s)": 0.355964 + }, + { + "epoch": 4.646464646464646, + "eval_loss": 0.3315931558609009, + "eval_runtime": 1.2435, + "eval_samples_per_second": 3.217, + "eval_steps_per_second": 3.217, + "eval_token_acc": 0.7627952755905512, + "step": 920 + }, + { + "epoch": 4.671717171717171, + "grad_norm": 0.40637901425361633, + "learning_rate": 1.1751739156407649e-06, + "loss": 0.014962595701217652, + "memory(GiB)": 132.94, + "step": 925, + "token_acc": 0.9563416188655195, + "train_speed(iter/s)": 0.355848 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.04171985015273094, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.09390320181846619, + "memory(GiB)": 132.94, + "step": 930, + "token_acc": 0.9552877138413686, + "train_speed(iter/s)": 0.356157 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.45875418186187744, + "learning_rate": 8.423376898168245e-07, + "loss": 0.09355719685554505, + "memory(GiB)": 132.94, + "step": 935, + 
"token_acc": 0.9625611980416626, + "train_speed(iter/s)": 0.35577 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 0.06517396122217178, + "learning_rate": 6.964873004985717e-07, + "loss": 0.04375721216201782, + "memory(GiB)": 132.94, + "step": 940, + "token_acc": 0.9753479792050981, + "train_speed(iter/s)": 0.355994 + }, + { + "epoch": 4.747474747474747, + "eval_loss": 0.3333018124103546, + "eval_runtime": 1.2677, + "eval_samples_per_second": 3.155, + "eval_steps_per_second": 3.155, + "eval_token_acc": 0.7637795275590551, + "step": 940 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.24420230090618134, + "learning_rate": 5.644043071326932e-07, + "loss": 0.03980659544467926, + "memory(GiB)": 132.94, + "step": 945, + "token_acc": 0.9619817650094615, + "train_speed(iter/s)": 0.355016 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 0.031997114419937134, + "learning_rate": 4.461255922609986e-07, + "loss": 0.03921380043029785, + "memory(GiB)": 132.94, + "step": 950, + "token_acc": 0.973630831643002, + "train_speed(iter/s)": 0.354979 + }, + { + "epoch": 4.8232323232323235, + "grad_norm": 0.41990208625793457, + "learning_rate": 3.416841837512952e-07, + "loss": 0.016747798025608062, + "memory(GiB)": 132.94, + "step": 955, + "token_acc": 0.9951302974466965, + "train_speed(iter/s)": 0.355094 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.5648083090782166, + "learning_rate": 2.511092455747932e-07, + "loss": 0.08067426681518555, + "memory(GiB)": 132.94, + "step": 960, + "token_acc": 0.9631029789807954, + "train_speed(iter/s)": 0.355274 + }, + { + "epoch": 4.848484848484849, + "eval_loss": 0.33460578322410583, + "eval_runtime": 1.2582, + "eval_samples_per_second": 3.179, + "eval_steps_per_second": 3.179, + "eval_token_acc": 0.7588582677165354, + "step": 960 + }, + { + "epoch": 4.873737373737374, + "grad_norm": 0.4006679654121399, + "learning_rate": 1.7442606966242004e-07, + "loss": 0.043474048376083374, + "memory(GiB)": 132.94, + "step": 965, + 
"token_acc": 0.9473123191716156, + "train_speed(iter/s)": 0.35506 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 0.4876633584499359, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.04453698992729187, + "memory(GiB)": 132.94, + "step": 970, + "token_acc": 0.9867834131835453, + "train_speed(iter/s)": 0.355439 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.2543767988681793, + "learning_rate": 6.281677086071303e-08, + "loss": 0.028288286924362183, + "memory(GiB)": 132.94, + "step": 975, + "token_acc": 0.993429158110883, + "train_speed(iter/s)": 0.355931 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 0.14917303621768951, + "learning_rate": 2.792181348726941e-08, + "loss": 0.07211887836456299, + "memory(GiB)": 132.94, + "step": 980, + "token_acc": 0.9649309245483528, + "train_speed(iter/s)": 0.355974 + }, + { + "epoch": 4.94949494949495, + "eval_loss": 0.3347429037094116, + "eval_runtime": 1.245, + "eval_samples_per_second": 3.213, + "eval_steps_per_second": 3.213, + "eval_token_acc": 0.7598425196850394, + "step": 980 + }, + { + "epoch": 4.974747474747475, + "grad_norm": 0.3278864622116089, + "learning_rate": 6.980940707146389e-09, + "loss": 0.14155206680297852, + "memory(GiB)": 132.94, + "step": 985, + "token_acc": 0.9222326748196927, + "train_speed(iter/s)": 0.355457 + }, + { + "epoch": 5.0, + "grad_norm": 0.20631925761699677, + "learning_rate": 0.0, + "loss": 0.016276916861534117, + "memory(GiB)": 132.94, + "step": 990, + "token_acc": 0.9947106908158359, + "train_speed(iter/s)": 0.355697 + }, + { + "epoch": 5.0, + "eval_loss": 0.3333551585674286, + "eval_runtime": 1.2395, + "eval_samples_per_second": 3.227, + "eval_steps_per_second": 3.227, + "eval_token_acc": 0.7618110236220472, + "step": 990 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + 
"should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.99483427739392e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/training_args.bin b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e5f1859e5e9c8a7e2ce5f39b8b402501b009c17 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff8ef96f1eb3882c67481e0c535acbbf7662496ccc01eebb173c3afaaa0ebe8 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..aa784cab158ae107481b199e38eea8bcf4321b2d Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..be4c7a3e319bd575b284daf084067ee33e1ada22 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_runtime.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..5f40abf55d765431cbdac23c707f7398a2dec6b1 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..dd7c460ee64bd066cb0f242008416a4d58da090c Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..fbce261a6ed959c53ed87a3239039e6aae459575 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..28af4713ddf3ee940ab6789f15d73de984b4c8ea Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..562fcbfb08c82810e9516c6451746deba1ded5ef Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..8977f64dd4afd86ccf4d0ec848be41268d0e90fc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_loss.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1b313afb9a46fb736046ba18c08f98531e0c1934 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..14116e3ff2599b9a1e737aaa64098f15d44e48ae Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..216f7aa5298d26fe8c9f3c7b4065a273af875bf3 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..aab203dfde3af1a38b3e1e0e381e19be7e12833f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..5b850e0db6f8e140b3ce7497895d2fd36d95ec6f Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_runtime.png 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..24313136f365f213d77e70d6cf3e4ce716dbfb2c Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..b400d8d4eaadfee8019f06d14f93e17d87c098c7 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..0a34030b4fef9af072770ed54f75d705de01df58 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..43376697894269bbf00d3a26861fdc87f4d7b884 Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/logging.jsonl b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c44738cbd433237622fb0dffffadc8492d1c7567 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/logging.jsonl @@ -0,0 +1,251 @@ +{"loss": 0.59401298, "token_acc": 0.83944954, "grad_norm": 0.15279238, "learning_rate": 2e-06, "memory(GiB)": 71.9, "train_speed(iter/s)": 0.211078, "epoch": 0.00505051, "global_step/max_steps": "1/990", "percentage": "0.10%", "elapsed_time": "4s", "remaining_time": "1h 12m 9s"} +{"loss": 0.70950592, "token_acc": 0.82916266, "grad_norm": 0.31371728, "learning_rate": 1e-05, "memory(GiB)": 81.34, "train_speed(iter/s)": 0.342487, "epoch": 0.02525253, "global_step/max_steps": "5/990", "percentage": "0.51%", "elapsed_time": "14s", "remaining_time": "46m 45s"} +{"loss": 0.75981936, "token_acc": 0.795834, "grad_norm": 0.17734563, "learning_rate": 2e-05, "memory(GiB)": 87.85, "train_speed(iter/s)": 0.368643, "epoch": 0.05050505, "global_step/max_steps": "10/990", "percentage": "1.01%", "elapsed_time": "26s", "remaining_time": "43m 43s"} +{"loss": 0.72793198, "token_acc": 0.80073222, "grad_norm": 0.19639592, "learning_rate": 3e-05, "memory(GiB)": 98.3, "train_speed(iter/s)": 0.370534, "epoch": 0.07575758, "global_step/max_steps": "15/990", "percentage": "1.52%", "elapsed_time": "40s", "remaining_time": "43m 27s"} +{"loss": 1.04437857, "token_acc": 0.85577481, "grad_norm": 2.97076893, "learning_rate": 4e-05, "memory(GiB)": 98.3, "train_speed(iter/s)": 0.390742, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", 
"elapsed_time": "50s", "remaining_time": "41m 4s"} +{"eval_loss": 1.69806659, "eval_token_acc": 0.71062992, "eval_runtime": 1.2677, "eval_samples_per_second": 3.155, "eval_steps_per_second": 3.155, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "52s", "remaining_time": "42m 6s"} +{"loss": 0.53669896, "token_acc": 0.82954103, "grad_norm": 0.24524027, "learning_rate": 5e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.358309, "epoch": 0.12626263, "global_step/max_steps": "25/990", "percentage": "2.53%", "elapsed_time": "1m 9s", "remaining_time": "44m 39s"} +{"loss": 0.61579213, "token_acc": 0.81432416, "grad_norm": 0.46116695, "learning_rate": 6e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.372063, "epoch": 0.15151515, "global_step/max_steps": "30/990", "percentage": "3.03%", "elapsed_time": "1m 20s", "remaining_time": "42m 48s"} +{"loss": 0.3712781, "token_acc": 0.85739593, "grad_norm": 0.15954609, "learning_rate": 7e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.380317, "epoch": 0.17676768, "global_step/max_steps": "35/990", "percentage": "3.54%", "elapsed_time": "1m 31s", "remaining_time": "41m 41s"} +{"loss": 0.44252305, "token_acc": 0.84485764, "grad_norm": 0.11871866, "learning_rate": 8e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.382931, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "1m 44s", "remaining_time": "41m 12s"} +{"eval_loss": 0.65733916, "eval_token_acc": 0.74311024, "eval_runtime": 1.307, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "1m 45s", "remaining_time": "41m 43s"} +{"loss": 0.45000429, "token_acc": 0.83732252, "grad_norm": 0.2137626, "learning_rate": 9e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.369324, "epoch": 0.22727273, "global_step/max_steps": "45/990", "percentage": "4.55%", "elapsed_time": "2m 1s", 
"remaining_time": "42m 31s"} +{"loss": 0.50507646, "token_acc": 0.87968952, "grad_norm": 0.1434803, "learning_rate": 0.0001, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.359356, "epoch": 0.25252525, "global_step/max_steps": "50/990", "percentage": "5.05%", "elapsed_time": "2m 18s", "remaining_time": "43m 29s"} +{"loss": 0.45570598, "token_acc": 0.85119667, "grad_norm": 0.19765082, "learning_rate": 9.999e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.35956, "epoch": 0.27777778, "global_step/max_steps": "55/990", "percentage": "5.56%", "elapsed_time": "2m 32s", "remaining_time": "43m 14s"} +{"loss": 0.3813853, "token_acc": 0.85935341, "grad_norm": 0.19557588, "learning_rate": 9.997e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.367382, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "2m 42s", "remaining_time": "42m 5s"} +{"eval_loss": 0.58861351, "eval_token_acc": 0.75098425, "eval_runtime": 1.2814, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "2m 44s", "remaining_time": "42m 25s"} +{"loss": 0.5502285, "token_acc": 0.82655583, "grad_norm": 0.28497896, "learning_rate": 9.994e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.36018, "epoch": 0.32828283, "global_step/max_steps": "65/990", "percentage": "6.57%", "elapsed_time": "3m 0s", "remaining_time": "42m 43s"} +{"loss": 0.412075, "token_acc": 0.87462006, "grad_norm": 0.11368525, "learning_rate": 9.989e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.357839, "epoch": 0.35353535, "global_step/max_steps": "70/990", "percentage": "7.07%", "elapsed_time": "3m 15s", "remaining_time": "42m 46s"} +{"loss": 0.58106685, "token_acc": 0.82878867, "grad_norm": 0.17141864, "learning_rate": 9.983e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.364223, "epoch": 0.37878788, "global_step/max_steps": "75/990", "percentage": "7.58%", "elapsed_time": 
"3m 25s", "remaining_time": "41m 47s"} +{"loss": 0.64724355, "token_acc": 0.84172662, "grad_norm": 0.17081511, "learning_rate": 9.975e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.366011, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "3m 38s", "remaining_time": "41m 22s"} +{"eval_loss": 0.57147294, "eval_token_acc": 0.75098425, "eval_runtime": 1.2893, "eval_samples_per_second": 3.103, "eval_steps_per_second": 3.103, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "3m 39s", "remaining_time": "41m 36s"} +{"loss": 0.52998276, "token_acc": 0.81894403, "grad_norm": 0.14895119, "learning_rate": 9.966e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.361053, "epoch": 0.42929293, "global_step/max_steps": "85/990", "percentage": "8.59%", "elapsed_time": "3m 55s", "remaining_time": "41m 42s"} +{"loss": 0.43860097, "token_acc": 0.84717949, "grad_norm": 0.33635759, "learning_rate": 9.955e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.363532, "epoch": 0.45454545, "global_step/max_steps": "90/990", "percentage": "9.09%", "elapsed_time": "4m 7s", "remaining_time": "41m 12s"} +{"loss": 0.35245497, "token_acc": 0.86939693, "grad_norm": 0.13267918, "learning_rate": 9.944e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.362313, "epoch": 0.47979798, "global_step/max_steps": "95/990", "percentage": "9.60%", "elapsed_time": "4m 21s", "remaining_time": "41m 6s"} +{"loss": 0.41884317, "token_acc": 0.86781354, "grad_norm": 0.13723704, "learning_rate": 9.93e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35826, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "4m 38s", "remaining_time": "41m 21s"} +{"eval_loss": 0.56501466, "eval_token_acc": 0.75885827, "eval_runtime": 1.3087, "eval_samples_per_second": 3.057, "eval_steps_per_second": 3.057, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", 
"elapsed_time": "4m 40s", "remaining_time": "41m 32s"} +{"loss": 0.54718208, "token_acc": 0.80525078, "grad_norm": 0.1817321, "learning_rate": 9.916e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356931, "epoch": 0.53030303, "global_step/max_steps": "105/990", "percentage": "10.61%", "elapsed_time": "4m 53s", "remaining_time": "41m 16s"} +{"loss": 0.45682454, "token_acc": 0.81914894, "grad_norm": 0.59327716, "learning_rate": 9.9e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.362578, "epoch": 0.55555556, "global_step/max_steps": "110/990", "percentage": "11.11%", "elapsed_time": "5m 3s", "remaining_time": "40m 24s"} +{"loss": 0.44985552, "token_acc": 0.86740937, "grad_norm": 0.100781, "learning_rate": 9.882e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357889, "epoch": 0.58080808, "global_step/max_steps": "115/990", "percentage": "11.62%", "elapsed_time": "5m 20s", "remaining_time": "40m 42s"} +{"loss": 0.50308418, "token_acc": 0.81294522, "grad_norm": 0.29969814, "learning_rate": 9.864e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357367, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "5m 35s", "remaining_time": "40m 31s"} +{"eval_loss": 0.53464282, "eval_token_acc": 0.76082677, "eval_runtime": 1.2611, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "5m 36s", "remaining_time": "40m 41s"} +{"loss": 0.47327857, "token_acc": 0.8376592, "grad_norm": 0.20031974, "learning_rate": 9.844e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356721, "epoch": 0.63131313, "global_step/max_steps": "125/990", "percentage": "12.63%", "elapsed_time": "5m 50s", "remaining_time": "40m 22s"} +{"loss": 0.4858623, "token_acc": 0.84097642, "grad_norm": 0.21032186, "learning_rate": 9.822e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.3569, "epoch": 0.65656566, "global_step/max_steps": 
"130/990", "percentage": "13.13%", "elapsed_time": "6m 3s", "remaining_time": "40m 7s"} +{"loss": 0.36222296, "token_acc": 0.87098913, "grad_norm": 0.10959025, "learning_rate": 9.8e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356829, "epoch": 0.68181818, "global_step/max_steps": "135/990", "percentage": "13.64%", "elapsed_time": "6m 17s", "remaining_time": "39m 53s"} +{"loss": 0.53283863, "token_acc": 0.82668731, "grad_norm": 0.20188008, "learning_rate": 9.776e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35701, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "6m 31s", "remaining_time": "39m 38s"} +{"eval_loss": 0.49480319, "eval_token_acc": 0.7519685, "eval_runtime": 1.2596, "eval_samples_per_second": 3.176, "eval_steps_per_second": 3.176, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "6m 33s", "remaining_time": "39m 46s"} +{"loss": 0.36090341, "token_acc": 0.85607786, "grad_norm": 0.2119002, "learning_rate": 9.75e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.353392, "epoch": 0.73232323, "global_step/max_steps": "145/990", "percentage": "14.65%", "elapsed_time": "6m 49s", "remaining_time": "39m 49s"} +{"loss": 0.37536747, "token_acc": 0.88122605, "grad_norm": 0.42249826, "learning_rate": 9.723e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354546, "epoch": 0.75757576, "global_step/max_steps": "150/990", "percentage": "15.15%", "elapsed_time": "7m 2s", "remaining_time": "39m 27s"} +{"loss": 0.41838737, "token_acc": 0.85766736, "grad_norm": 0.23837934, "learning_rate": 9.695e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354581, "epoch": 0.78282828, "global_step/max_steps": "155/990", "percentage": "15.66%", "elapsed_time": "7m 16s", "remaining_time": "39m 12s"} +{"loss": 0.3394254, "token_acc": 0.8810381, "grad_norm": 0.20167972, "learning_rate": 9.666e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356298, "epoch": 0.80808081, 
"global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "7m 28s", "remaining_time": "38m 47s"} +{"eval_loss": 0.48000994, "eval_token_acc": 0.75492126, "eval_runtime": 1.2638, "eval_samples_per_second": 3.165, "eval_steps_per_second": 3.165, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "7m 29s", "remaining_time": "38m 54s"} +{"loss": 0.41873646, "token_acc": 0.84912373, "grad_norm": 0.14168292, "learning_rate": 9.635e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354225, "epoch": 0.83333333, "global_step/max_steps": "165/990", "percentage": "16.67%", "elapsed_time": "7m 45s", "remaining_time": "38m 47s"} +{"loss": 0.45646882, "token_acc": 0.85191956, "grad_norm": 0.20265256, "learning_rate": 9.603e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35386, "epoch": 0.85858586, "global_step/max_steps": "170/990", "percentage": "17.17%", "elapsed_time": "8m 0s", "remaining_time": "38m 35s"} +{"loss": 0.41248093, "token_acc": 0.85664102, "grad_norm": 0.28237239, "learning_rate": 9.57e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.355591, "epoch": 0.88383838, "global_step/max_steps": "175/990", "percentage": "17.68%", "elapsed_time": "8m 11s", "remaining_time": "38m 10s"} +{"loss": 0.4143404, "token_acc": 0.86321276, "grad_norm": 0.19565301, "learning_rate": 9.535e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356508, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "8m 24s", "remaining_time": "37m 50s"} +{"eval_loss": 0.48190409, "eval_token_acc": 0.76377953, "eval_runtime": 1.2641, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "8m 25s", "remaining_time": "37m 56s"} +{"loss": 0.49567237, "token_acc": 0.81756587, "grad_norm": 0.25014156, "learning_rate": 9.5e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.355286, 
"epoch": 0.93434343, "global_step/max_steps": "185/990", "percentage": "18.69%", "elapsed_time": "8m 40s", "remaining_time": "37m 44s"} +{"loss": 0.58329329, "token_acc": 0.80240022, "grad_norm": 3.05962586, "learning_rate": 9.463e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357668, "epoch": 0.95959596, "global_step/max_steps": "190/990", "percentage": "19.19%", "elapsed_time": "8m 50s", "remaining_time": "37m 15s"} +{"loss": 0.55915103, "token_acc": 0.81467397, "grad_norm": 0.15529379, "learning_rate": 9.424e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.3574, "epoch": 0.98484848, "global_step/max_steps": "195/990", "percentage": "19.70%", "elapsed_time": "9m 5s", "remaining_time": "37m 2s"} +{"loss": 0.50255785, "token_acc": 0.84831375, "grad_norm": 0.15638216, "learning_rate": 9.385e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.353969, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "9m 24s", "remaining_time": "37m 10s"} +{"eval_loss": 0.47340757, "eval_token_acc": 0.76673228, "eval_runtime": 1.2465, "eval_samples_per_second": 3.209, "eval_steps_per_second": 3.209, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "9m 25s", "remaining_time": "37m 15s"} +{"loss": 0.45037289, "token_acc": 0.83539911, "grad_norm": 0.16425619, "learning_rate": 9.344e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350091, "epoch": 1.03535354, "global_step/max_steps": "205/990", "percentage": "20.71%", "elapsed_time": "9m 45s", "remaining_time": "37m 20s"} +{"loss": 0.42234116, "token_acc": 0.86146711, "grad_norm": 0.21121171, "learning_rate": 9.302e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350171, "epoch": 1.06060606, "global_step/max_steps": "210/990", "percentage": "21.21%", "elapsed_time": "9m 59s", "remaining_time": "37m 6s"} +{"loss": 0.3796834, "token_acc": 0.87460399, "grad_norm": 0.1808321, "learning_rate": 9.259e-05, "memory(GiB)": 132.92, 
"train_speed(iter/s)": 0.349555, "epoch": 1.08585859, "global_step/max_steps": "215/990", "percentage": "21.72%", "elapsed_time": "10m 14s", "remaining_time": "36m 55s"} +{"loss": 0.35806198, "token_acc": 0.87960545, "grad_norm": 0.30925488, "learning_rate": 9.214e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350637, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "10m 27s", "remaining_time": "36m 34s"} +{"eval_loss": 0.49181956, "eval_token_acc": 0.76279528, "eval_runtime": 1.2455, "eval_samples_per_second": 3.212, "eval_steps_per_second": 3.212, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "10m 28s", "remaining_time": "36m 39s"} +{"loss": 0.48544436, "token_acc": 0.84440934, "grad_norm": 0.36294565, "learning_rate": 9.169e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.349163, "epoch": 1.13636364, "global_step/max_steps": "225/990", "percentage": "22.73%", "elapsed_time": "10m 44s", "remaining_time": "36m 29s"} +{"loss": 0.39215381, "token_acc": 0.86883977, "grad_norm": 0.20928738, "learning_rate": 9.122e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.347175, "epoch": 1.16161616, "global_step/max_steps": "230/990", "percentage": "23.23%", "elapsed_time": "11m 2s", "remaining_time": "36m 27s"} +{"loss": 0.20322649, "token_acc": 0.90918316, "grad_norm": 0.24130744, "learning_rate": 9.074e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.349366, "epoch": 1.18686869, "global_step/max_steps": "235/990", "percentage": "23.74%", "elapsed_time": "11m 12s", "remaining_time": "35m 59s"} +{"loss": 0.44439197, "token_acc": 0.85512654, "grad_norm": 0.40426677, "learning_rate": 9.025e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350626, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "11m 24s", "remaining_time": "35m 37s"} +{"eval_loss": 0.51390809, "eval_token_acc": 0.76279528, "eval_runtime": 1.2422, 
"eval_samples_per_second": 3.22, "eval_steps_per_second": 3.22, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "11m 25s", "remaining_time": "35m 41s"} +{"loss": 0.29034128, "token_acc": 0.85684279, "grad_norm": 0.25741172, "learning_rate": 8.975e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350706, "epoch": 1.23737374, "global_step/max_steps": "245/990", "percentage": "24.75%", "elapsed_time": "11m 38s", "remaining_time": "35m 23s"} +{"loss": 0.42200923, "token_acc": 0.85240244, "grad_norm": 0.49813297, "learning_rate": 8.924e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.352825, "epoch": 1.26262626, "global_step/max_steps": "250/990", "percentage": "25.25%", "elapsed_time": "11m 48s", "remaining_time": "34m 56s"} +{"loss": 0.379809, "token_acc": 0.85981137, "grad_norm": 0.25152785, "learning_rate": 8.872e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.351391, "epoch": 1.28787879, "global_step/max_steps": "255/990", "percentage": "25.76%", "elapsed_time": "12m 5s", "remaining_time": "34m 50s"} +{"loss": 0.3815913, "token_acc": 0.85433207, "grad_norm": 0.43190768, "learning_rate": 8.818e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.352512, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "12m 17s", "remaining_time": "34m 29s"} +{"eval_loss": 0.44263035, "eval_token_acc": 0.77066929, "eval_runtime": 1.2477, "eval_samples_per_second": 3.206, "eval_steps_per_second": 3.206, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "12m 18s", "remaining_time": "34m 33s"} +{"loss": 0.30707059, "token_acc": 0.89507048, "grad_norm": 0.1287196, "learning_rate": 8.764e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.348922, "epoch": 1.33838384, "global_step/max_steps": "265/990", "percentage": "26.77%", "elapsed_time": "12m 39s", "remaining_time": "34m 36s"} +{"loss": 0.28065162, "token_acc": 0.89539462, 
"grad_norm": 0.28978527, "learning_rate": 8.708e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350022, "epoch": 1.36363636, "global_step/max_steps": "270/990", "percentage": "27.27%", "elapsed_time": "12m 51s", "remaining_time": "34m 16s"} +{"loss": 0.29118433, "token_acc": 0.89552858, "grad_norm": 0.2160148, "learning_rate": 8.652e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.349742, "epoch": 1.38888889, "global_step/max_steps": "275/990", "percentage": "27.78%", "elapsed_time": "13m 5s", "remaining_time": "34m 3s"} +{"loss": 0.41789637, "token_acc": 0.86854701, "grad_norm": 0.58274782, "learning_rate": 8.594e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350778, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "13m 17s", "remaining_time": "33m 43s"} +{"eval_loss": 0.42635608, "eval_token_acc": 0.77362205, "eval_runtime": 1.244, "eval_samples_per_second": 3.215, "eval_steps_per_second": 3.215, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "13m 19s", "remaining_time": "33m 46s"} +{"loss": 0.23049119, "token_acc": 0.88961262, "grad_norm": 0.53205258, "learning_rate": 8.536e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350607, "epoch": 1.43939394, "global_step/max_steps": "285/990", "percentage": "28.79%", "elapsed_time": "13m 32s", "remaining_time": "33m 29s"} +{"loss": 0.36380806, "token_acc": 0.86199232, "grad_norm": 0.44731838, "learning_rate": 8.476e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352023, "epoch": 1.46464646, "global_step/max_steps": "290/990", "percentage": "29.29%", "elapsed_time": "13m 43s", "remaining_time": "33m 7s"} +{"loss": 0.39556937, "token_acc": 0.85105898, "grad_norm": 0.30866507, "learning_rate": 8.415e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353538, "epoch": 1.48989899, "global_step/max_steps": "295/990", "percentage": "29.80%", "elapsed_time": "13m 54s", "remaining_time": "32m 44s"} +{"loss": 
0.3252337, "token_acc": 0.88556925, "grad_norm": 1.61200285, "learning_rate": 8.354e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353789, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "14m 7s", "remaining_time": "32m 29s"} +{"eval_loss": 0.45433885, "eval_token_acc": 0.77755906, "eval_runtime": 1.2577, "eval_samples_per_second": 3.18, "eval_steps_per_second": 3.18, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "14m 8s", "remaining_time": "32m 32s"} +{"loss": 0.3657505, "token_acc": 0.86810016, "grad_norm": 0.37553474, "learning_rate": 8.291e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.351379, "epoch": 1.54040404, "global_step/max_steps": "305/990", "percentage": "30.81%", "elapsed_time": "14m 27s", "remaining_time": "32m 28s"} +{"loss": 0.30150371, "token_acc": 0.90084617, "grad_norm": 0.33566633, "learning_rate": 8.228e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352623, "epoch": 1.56565657, "global_step/max_steps": "310/990", "percentage": "31.31%", "elapsed_time": "14m 38s", "remaining_time": "32m 7s"} +{"loss": 0.31590912, "token_acc": 0.87795276, "grad_norm": 0.8437044, "learning_rate": 8.164e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353918, "epoch": 1.59090909, "global_step/max_steps": "315/990", "percentage": "31.82%", "elapsed_time": "14m 49s", "remaining_time": "31m 46s"} +{"loss": 0.33113201, "token_acc": 0.90200889, "grad_norm": 0.38012376, "learning_rate": 8.099e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354333, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "15m 2s", "remaining_time": "31m 30s"} +{"eval_loss": 0.43398547, "eval_token_acc": 0.7726378, "eval_runtime": 1.2796, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "15m 4s", 
"remaining_time": "31m 32s"} +{"loss": 0.34933012, "token_acc": 0.85279739, "grad_norm": 0.37100571, "learning_rate": 8.033e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353766, "epoch": 1.64141414, "global_step/max_steps": "325/990", "percentage": "32.83%", "elapsed_time": "15m 18s", "remaining_time": "31m 19s"} +{"loss": 0.37286627, "token_acc": 0.88436919, "grad_norm": 0.58018267, "learning_rate": 7.966e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354876, "epoch": 1.66666667, "global_step/max_steps": "330/990", "percentage": "33.33%", "elapsed_time": "15m 29s", "remaining_time": "30m 59s"} +{"loss": 0.37944331, "token_acc": 0.87274744, "grad_norm": 0.3128258, "learning_rate": 7.898e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354891, "epoch": 1.69191919, "global_step/max_steps": "335/990", "percentage": "33.84%", "elapsed_time": "15m 43s", "remaining_time": "30m 44s"} +{"loss": 0.26272769, "token_acc": 0.90750436, "grad_norm": 0.81518579, "learning_rate": 7.83e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355958, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "15m 54s", "remaining_time": "30m 25s"} +{"eval_loss": 0.37076545, "eval_token_acc": 0.77559055, "eval_runtime": 1.2578, "eval_samples_per_second": 3.18, "eval_steps_per_second": 3.18, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "15m 56s", "remaining_time": "30m 27s"} +{"loss": 0.28121564, "token_acc": 0.89513307, "grad_norm": 0.324294, "learning_rate": 7.76e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354629, "epoch": 1.74242424, "global_step/max_steps": "345/990", "percentage": "34.85%", "elapsed_time": "16m 12s", "remaining_time": "30m 18s"} +{"loss": 0.20163231, "token_acc": 0.92025157, "grad_norm": 1.06142664, "learning_rate": 7.69e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355616, "epoch": 1.76767677, "global_step/max_steps": "350/990", "percentage": 
"35.35%", "elapsed_time": "16m 23s", "remaining_time": "29m 59s"} +{"loss": 0.47138438, "token_acc": 0.8495345, "grad_norm": 0.62606174, "learning_rate": 7.62e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355891, "epoch": 1.79292929, "global_step/max_steps": "355/990", "percentage": "35.86%", "elapsed_time": "16m 37s", "remaining_time": "29m 43s"} +{"loss": 0.25896482, "token_acc": 0.89644719, "grad_norm": 0.2624684, "learning_rate": 7.548e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356199, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "16m 50s", "remaining_time": "29m 28s"} +{"eval_loss": 0.39613855, "eval_token_acc": 0.7765748, "eval_runtime": 1.2624, "eval_samples_per_second": 3.168, "eval_steps_per_second": 3.168, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "16m 51s", "remaining_time": "29m 30s"} +{"loss": 0.38772385, "token_acc": 0.85438187, "grad_norm": 0.71769845, "learning_rate": 7.476e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356015, "epoch": 1.84343434, "global_step/max_steps": "365/990", "percentage": "36.87%", "elapsed_time": "17m 4s", "remaining_time": "29m 14s"} +{"loss": 0.34966278, "token_acc": 0.87574038, "grad_norm": 0.27225521, "learning_rate": 7.403e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356009, "epoch": 1.86868687, "global_step/max_steps": "370/990", "percentage": "37.37%", "elapsed_time": "17m 18s", "remaining_time": "29m 0s"} +{"loss": 0.40628734, "token_acc": 0.86587236, "grad_norm": 0.41945073, "learning_rate": 7.329e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355662, "epoch": 1.89393939, "global_step/max_steps": "375/990", "percentage": "37.88%", "elapsed_time": "17m 34s", "remaining_time": "28m 48s"} +{"loss": 0.35225565, "token_acc": 0.88881253, "grad_norm": 0.60451317, "learning_rate": 7.255e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356078, "epoch": 1.91919192, 
"global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "17m 46s", "remaining_time": "28m 32s"} +{"eval_loss": 0.38526344, "eval_token_acc": 0.7726378, "eval_runtime": 1.2874, "eval_samples_per_second": 3.107, "eval_steps_per_second": 3.107, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "17m 48s", "remaining_time": "28m 34s"} +{"loss": 0.37585311, "token_acc": 0.88160314, "grad_norm": 0.16466928, "learning_rate": 7.18e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354701, "epoch": 1.94444444, "global_step/max_steps": "385/990", "percentage": "38.89%", "elapsed_time": "18m 5s", "remaining_time": "28m 25s"} +{"loss": 0.25691216, "token_acc": 0.89950558, "grad_norm": 0.32463047, "learning_rate": 7.105e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354307, "epoch": 1.96969697, "global_step/max_steps": "390/990", "percentage": "39.39%", "elapsed_time": "18m 20s", "remaining_time": "28m 12s"} +{"loss": 0.25709858, "token_acc": 0.89952798, "grad_norm": 0.30748469, "learning_rate": 7.029e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35515, "epoch": 1.99494949, "global_step/max_steps": "395/990", "percentage": "39.90%", "elapsed_time": "18m 31s", "remaining_time": "27m 54s"} +{"loss": 0.22938874, "token_acc": 0.94195168, "grad_norm": 0.56834871, "learning_rate": 6.952e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356409, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "18m 41s", "remaining_time": "27m 34s"} +{"eval_loss": 0.36361957, "eval_token_acc": 0.76771654, "eval_runtime": 1.2559, "eval_samples_per_second": 3.185, "eval_steps_per_second": 3.185, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "18m 43s", "remaining_time": "27m 36s"} +{"loss": 0.15864257, "token_acc": 0.90921465, "grad_norm": 0.6048702, "learning_rate": 6.875e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 
0.355742, "epoch": 2.04545455, "global_step/max_steps": "405/990", "percentage": "40.91%", "elapsed_time": "18m 58s", "remaining_time": "27m 23s"} +{"loss": 0.32331791, "token_acc": 0.91766159, "grad_norm": 0.70897079, "learning_rate": 6.797e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355669, "epoch": 2.07070707, "global_step/max_steps": "410/990", "percentage": "41.41%", "elapsed_time": "19m 12s", "remaining_time": "27m 10s"} +{"loss": 0.19509116, "token_acc": 0.924197, "grad_norm": 0.37055758, "learning_rate": 6.719e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356398, "epoch": 2.0959596, "global_step/max_steps": "415/990", "percentage": "41.92%", "elapsed_time": "19m 24s", "remaining_time": "26m 52s"} +{"loss": 0.09936256, "token_acc": 0.97148676, "grad_norm": 0.28154191, "learning_rate": 6.64e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35724, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "19m 35s", "remaining_time": "26m 35s"} +{"eval_loss": 0.37009484, "eval_token_acc": 0.77362205, "eval_runtime": 1.2502, "eval_samples_per_second": 3.199, "eval_steps_per_second": 3.199, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "19m 36s", "remaining_time": "26m 36s"} +{"loss": 0.18864726, "token_acc": 0.89312755, "grad_norm": 0.61892229, "learning_rate": 6.561e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357016, "epoch": 2.14646465, "global_step/max_steps": "425/990", "percentage": "42.93%", "elapsed_time": "19m 50s", "remaining_time": "26m 22s"} +{"loss": 0.13603065, "token_acc": 0.95940751, "grad_norm": 0.47222099, "learning_rate": 6.481e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356484, "epoch": 2.17171717, "global_step/max_steps": "430/990", "percentage": "43.43%", "elapsed_time": "20m 5s", "remaining_time": "26m 10s"} +{"loss": 0.18846033, "token_acc": 0.94275395, "grad_norm": 0.25799301, "learning_rate": 6.401e-05, 
"memory(GiB)": 132.93, "train_speed(iter/s)": 0.355717, "epoch": 2.1969697, "global_step/max_steps": "435/990", "percentage": "43.94%", "elapsed_time": "20m 22s", "remaining_time": "25m 59s"} +{"loss": 0.21381202, "token_acc": 0.92740741, "grad_norm": 0.84787679, "learning_rate": 6.321e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356335, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "20m 34s", "remaining_time": "25m 43s"} +{"eval_loss": 0.37863982, "eval_token_acc": 0.77362205, "eval_runtime": 1.236, "eval_samples_per_second": 3.236, "eval_steps_per_second": 3.236, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "20m 35s", "remaining_time": "25m 44s"} +{"loss": 0.12016981, "token_acc": 0.92939536, "grad_norm": 0.3824341, "learning_rate": 6.24e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355702, "epoch": 2.24747475, "global_step/max_steps": "445/990", "percentage": "44.95%", "elapsed_time": "20m 50s", "remaining_time": "25m 31s"} +{"loss": 0.26917288, "token_acc": 0.90533755, "grad_norm": 0.71722466, "learning_rate": 6.159e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356304, "epoch": 2.27272727, "global_step/max_steps": "450/990", "percentage": "45.45%", "elapsed_time": "21m 2s", "remaining_time": "25m 15s"} +{"loss": 0.18497173, "token_acc": 0.93376289, "grad_norm": 0.24402711, "learning_rate": 6.078e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356536, "epoch": 2.2979798, "global_step/max_steps": "455/990", "percentage": "45.96%", "elapsed_time": "21m 15s", "remaining_time": "25m 0s"} +{"loss": 0.19234394, "token_acc": 0.93608216, "grad_norm": 0.24683055, "learning_rate": 5.996e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356885, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "21m 28s", "remaining_time": "24m 44s"} +{"eval_loss": 0.36144415, "eval_token_acc": 0.77165354, 
"eval_runtime": 1.2634, "eval_samples_per_second": 3.166, "eval_steps_per_second": 3.166, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "21m 29s", "remaining_time": "24m 46s"} +{"loss": 0.17323194, "token_acc": 0.89190939, "grad_norm": 0.48954588, "learning_rate": 5.914e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357128, "epoch": 2.34848485, "global_step/max_steps": "465/990", "percentage": "46.97%", "elapsed_time": "21m 41s", "remaining_time": "24m 29s"} +{"loss": 0.13749444, "token_acc": 0.94342195, "grad_norm": 0.55840182, "learning_rate": 5.832e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35673, "epoch": 2.37373737, "global_step/max_steps": "470/990", "percentage": "47.47%", "elapsed_time": "21m 57s", "remaining_time": "24m 17s"} +{"loss": 0.20462332, "token_acc": 0.92272379, "grad_norm": 1.30183637, "learning_rate": 5.749e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357258, "epoch": 2.3989899, "global_step/max_steps": "475/990", "percentage": "47.98%", "elapsed_time": "22m 9s", "remaining_time": "24m 1s"} +{"loss": 0.20529029, "token_acc": 0.92612733, "grad_norm": 0.33143967, "learning_rate": 5.666e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356138, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "22m 27s", "remaining_time": "23m 51s"} +{"eval_loss": 0.36717469, "eval_token_acc": 0.76181102, "eval_runtime": 1.2559, "eval_samples_per_second": 3.185, "eval_steps_per_second": 3.185, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "22m 28s", "remaining_time": "23m 52s"} +{"loss": 0.14841547, "token_acc": 0.89708211, "grad_norm": 0.63942325, "learning_rate": 5.584e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356392, "epoch": 2.44949495, "global_step/max_steps": "485/990", "percentage": "48.99%", "elapsed_time": "22m 40s", "remaining_time": "23m 36s"} +{"loss": 0.28578765, 
"token_acc": 0.89564867, "grad_norm": 0.55023944, "learning_rate": 5.5e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356615, "epoch": 2.47474747, "global_step/max_steps": "490/990", "percentage": "49.49%", "elapsed_time": "22m 53s", "remaining_time": "23m 21s"} +{"loss": 0.15979899, "token_acc": 0.93821839, "grad_norm": 0.27991933, "learning_rate": 5.417e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357352, "epoch": 2.5, "global_step/max_steps": "495/990", "percentage": "50.00%", "elapsed_time": "23m 4s", "remaining_time": "23m 4s"} +{"loss": 0.23016484, "token_acc": 0.89288353, "grad_norm": 0.47879472, "learning_rate": 5.334e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357166, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "23m 19s", "remaining_time": "22m 51s"} +{"eval_loss": 0.36765411, "eval_token_acc": 0.77066929, "eval_runtime": 1.2648, "eval_samples_per_second": 3.162, "eval_steps_per_second": 3.162, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "23m 20s", "remaining_time": "22m 52s"} +{"loss": 0.15926197, "token_acc": 0.90470707, "grad_norm": 0.56265652, "learning_rate": 5.251e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356766, "epoch": 2.55050505, "global_step/max_steps": "505/990", "percentage": "51.01%", "elapsed_time": "23m 35s", "remaining_time": "22m 39s"} +{"loss": 0.15416876, "token_acc": 0.93856581, "grad_norm": 0.72930646, "learning_rate": 5.167e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357626, "epoch": 2.57575758, "global_step/max_steps": "510/990", "percentage": "51.52%", "elapsed_time": "23m 45s", "remaining_time": "22m 21s"} +{"loss": 0.1874794, "token_acc": 0.92710425, "grad_norm": 0.32495785, "learning_rate": 5.084e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.358064, "epoch": 2.6010101, "global_step/max_steps": "515/990", "percentage": "52.02%", "elapsed_time": "23m 57s", "remaining_time": 
"22m 6s"} +{"loss": 0.19285821, "token_acc": 0.91820684, "grad_norm": 0.37267098, "learning_rate": 5e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.358116, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "24m 11s", "remaining_time": "21m 52s"} +{"eval_loss": 0.35743266, "eval_token_acc": 0.76870079, "eval_runtime": 1.2997, "eval_samples_per_second": 3.078, "eval_steps_per_second": 3.078, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "24m 12s", "remaining_time": "21m 53s"} +{"loss": 0.16548975, "token_acc": 0.91607419, "grad_norm": 0.41294599, "learning_rate": 4.916e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357945, "epoch": 2.65151515, "global_step/max_steps": "525/990", "percentage": "53.03%", "elapsed_time": "24m 26s", "remaining_time": "21m 38s"} +{"loss": 0.22736096, "token_acc": 0.91092316, "grad_norm": 0.35475859, "learning_rate": 4.833e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357177, "epoch": 2.67676768, "global_step/max_steps": "530/990", "percentage": "53.54%", "elapsed_time": "24m 43s", "remaining_time": "21m 27s"} +{"loss": 0.16493123, "token_acc": 0.94582564, "grad_norm": 0.76286024, "learning_rate": 4.749e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356557, "epoch": 2.7020202, "global_step/max_steps": "535/990", "percentage": "54.04%", "elapsed_time": "25m 0s", "remaining_time": "21m 15s"} +{"loss": 0.29071348, "token_acc": 0.89696033, "grad_norm": 0.97341543, "learning_rate": 4.666e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356129, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "25m 15s", "remaining_time": "21m 3s"} +{"eval_loss": 0.3351669, "eval_token_acc": 0.77165354, "eval_runtime": 1.2565, "eval_samples_per_second": 3.183, "eval_steps_per_second": 3.183, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "25m 
17s", "remaining_time": "21m 4s"} +{"loss": 0.22140293, "token_acc": 0.88909534, "grad_norm": 0.56667453, "learning_rate": 4.583e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355557, "epoch": 2.75252525, "global_step/max_steps": "545/990", "percentage": "55.05%", "elapsed_time": "25m 32s", "remaining_time": "20m 51s"} +{"loss": 0.17021152, "token_acc": 0.93102423, "grad_norm": 0.42045528, "learning_rate": 4.5e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355665, "epoch": 2.77777778, "global_step/max_steps": "550/990", "percentage": "55.56%", "elapsed_time": "25m 46s", "remaining_time": "20m 36s"} +{"loss": 0.18070143, "token_acc": 0.93980033, "grad_norm": 0.66016507, "learning_rate": 4.416e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356126, "epoch": 2.8030303, "global_step/max_steps": "555/990", "percentage": "56.06%", "elapsed_time": "25m 58s", "remaining_time": "20m 21s"} +{"loss": 0.15745289, "token_acc": 0.92938057, "grad_norm": 0.25845787, "learning_rate": 4.334e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356108, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "26m 12s", "remaining_time": "20m 7s"} +{"eval_loss": 0.33242106, "eval_token_acc": 0.76968504, "eval_runtime": 1.2648, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "26m 13s", "remaining_time": "20m 8s"} +{"loss": 0.20625925, "token_acc": 0.90468103, "grad_norm": 0.71159899, "learning_rate": 4.251e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355539, "epoch": 2.85353535, "global_step/max_steps": "565/990", "percentage": "57.07%", "elapsed_time": "26m 28s", "remaining_time": "19m 55s"} +{"loss": 0.25417945, "token_acc": 0.90460157, "grad_norm": 0.60344523, "learning_rate": 4.168e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355222, "epoch": 2.87878788, "global_step/max_steps": "570/990", 
"percentage": "57.58%", "elapsed_time": "26m 44s", "remaining_time": "19m 42s"} +{"loss": 0.17194978, "token_acc": 0.93406737, "grad_norm": 0.90261829, "learning_rate": 4.086e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35529, "epoch": 2.9040404, "global_step/max_steps": "575/990", "percentage": "58.08%", "elapsed_time": "26m 58s", "remaining_time": "19m 27s"} +{"loss": 0.25096388, "token_acc": 0.89321732, "grad_norm": 2.0807786, "learning_rate": 4.004e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355904, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "27m 9s", "remaining_time": "19m 11s"} +{"eval_loss": 0.34411764, "eval_token_acc": 0.76968504, "eval_runtime": 1.2647, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "27m 10s", "remaining_time": "19m 12s"} +{"loss": 0.21597733, "token_acc": 0.88901646, "grad_norm": 0.81269169, "learning_rate": 3.922e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355876, "epoch": 2.95454545, "global_step/max_steps": "585/990", "percentage": "59.09%", "elapsed_time": "27m 23s", "remaining_time": "18m 57s"} +{"loss": 0.2014812, "token_acc": 0.93326271, "grad_norm": 0.70208395, "learning_rate": 3.841e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355335, "epoch": 2.97979798, "global_step/max_steps": "590/990", "percentage": "59.60%", "elapsed_time": "27m 40s", "remaining_time": "18m 45s"} +{"loss": 0.2256376, "token_acc": 0.9326572, "grad_norm": 0.25534526, "learning_rate": 3.76e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354817, "epoch": 3.00505051, "global_step/max_steps": "595/990", "percentage": "60.10%", "elapsed_time": "27m 56s", "remaining_time": "18m 33s"} +{"loss": 0.13529282, "token_acc": 0.95718525, "grad_norm": 0.83795023, "learning_rate": 3.679e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355006, "epoch": 3.03030303, 
"global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "28m 9s", "remaining_time": "18m 18s"} +{"eval_loss": 0.30427584, "eval_token_acc": 0.77066929, "eval_runtime": 1.2499, "eval_samples_per_second": 3.2, "eval_steps_per_second": 3.2, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "28m 11s", "remaining_time": "18m 19s"} +{"loss": 0.09742945, "token_acc": 0.93990385, "grad_norm": 0.57407409, "learning_rate": 3.599e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353865, "epoch": 3.05555556, "global_step/max_steps": "605/990", "percentage": "61.11%", "elapsed_time": "28m 29s", "remaining_time": "18m 7s"} +{"loss": 0.05046467, "token_acc": 0.97425664, "grad_norm": 0.72763276, "learning_rate": 3.519e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354532, "epoch": 3.08080808, "global_step/max_steps": "610/990", "percentage": "61.62%", "elapsed_time": "28m 40s", "remaining_time": "17m 51s"} +{"loss": 0.03918294, "token_acc": 0.98925934, "grad_norm": 0.16198806, "learning_rate": 3.439e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355134, "epoch": 3.10606061, "global_step/max_steps": "615/990", "percentage": "62.12%", "elapsed_time": "28m 51s", "remaining_time": "17m 35s"} +{"loss": 0.0853424, "token_acc": 0.97041223, "grad_norm": 0.42892238, "learning_rate": 3.36e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355115, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "29m 5s", "remaining_time": "17m 21s"} +{"eval_loss": 0.30796593, "eval_token_acc": 0.76574803, "eval_runtime": 1.2475, "eval_samples_per_second": 3.207, "eval_steps_per_second": 3.207, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "29m 6s", "remaining_time": "17m 22s"} +{"loss": 0.09771489, "token_acc": 0.94831328, "grad_norm": 0.40636337, "learning_rate": 3.281e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 
0.354176, "epoch": 3.15656566, "global_step/max_steps": "625/990", "percentage": "63.13%", "elapsed_time": "29m 24s", "remaining_time": "17m 10s"} +{"loss": 0.06919086, "token_acc": 0.96818839, "grad_norm": 0.74913073, "learning_rate": 3.203e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354714, "epoch": 3.18181818, "global_step/max_steps": "630/990", "percentage": "63.64%", "elapsed_time": "29m 35s", "remaining_time": "16m 54s"} +{"loss": 0.13240322, "token_acc": 0.94445062, "grad_norm": 0.57841122, "learning_rate": 3.125e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354589, "epoch": 3.20707071, "global_step/max_steps": "635/990", "percentage": "64.14%", "elapsed_time": "29m 50s", "remaining_time": "16m 40s"} +{"loss": 0.11127981, "token_acc": 0.94948735, "grad_norm": 0.44068816, "learning_rate": 3.048e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353946, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "30m 7s", "remaining_time": "16m 28s"} +{"eval_loss": 0.30791736, "eval_token_acc": 0.77165354, "eval_runtime": 1.2574, "eval_samples_per_second": 3.181, "eval_steps_per_second": 3.181, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "30m 9s", "remaining_time": "16m 29s"} +{"loss": 0.08342495, "token_acc": 0.94528481, "grad_norm": 0.4303298, "learning_rate": 2.971e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352926, "epoch": 3.25757576, "global_step/max_steps": "645/990", "percentage": "65.15%", "elapsed_time": "30m 27s", "remaining_time": "16m 17s"} +{"loss": 0.12928666, "token_acc": 0.95302239, "grad_norm": 0.26030338, "learning_rate": 2.895e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353396, "epoch": 3.28282828, "global_step/max_steps": "650/990", "percentage": "65.66%", "elapsed_time": "30m 38s", "remaining_time": "16m 1s"} +{"loss": 0.09613171, "token_acc": 0.96914425, "grad_norm": 0.49660748, "learning_rate": 2.82e-05, 
"memory(GiB)": 132.93, "train_speed(iter/s)": 0.353561, "epoch": 3.30808081, "global_step/max_steps": "655/990", "percentage": "66.16%", "elapsed_time": "30m 52s", "remaining_time": "15m 47s"} +{"loss": 0.09578182, "token_acc": 0.96340326, "grad_norm": 0.28682232, "learning_rate": 2.745e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353616, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "31m 6s", "remaining_time": "15m 33s"} +{"eval_loss": 0.30913258, "eval_token_acc": 0.77066929, "eval_runtime": 1.251, "eval_samples_per_second": 3.197, "eval_steps_per_second": 3.197, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "31m 7s", "remaining_time": "15m 33s"} +{"loss": 0.09090739, "token_acc": 0.92610562, "grad_norm": 0.81727451, "learning_rate": 2.671e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353647, "epoch": 3.35858586, "global_step/max_steps": "665/990", "percentage": "67.17%", "elapsed_time": "31m 20s", "remaining_time": "15m 18s"} +{"loss": 0.10645673, "token_acc": 0.95919648, "grad_norm": 0.81599212, "learning_rate": 2.597e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353871, "epoch": 3.38383838, "global_step/max_steps": "670/990", "percentage": "67.68%", "elapsed_time": "31m 32s", "remaining_time": "15m 4s"} +{"loss": 0.07736409, "token_acc": 0.96231985, "grad_norm": 0.35854483, "learning_rate": 2.524e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354188, "epoch": 3.40909091, "global_step/max_steps": "675/990", "percentage": "68.18%", "elapsed_time": "31m 45s", "remaining_time": "14m 49s"} +{"loss": 0.1233693, "token_acc": 0.94138418, "grad_norm": 0.63011169, "learning_rate": 2.452e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354564, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "31m 57s", "remaining_time": "14m 34s"} +{"eval_loss": 0.30977443, "eval_token_acc": 0.76476378, 
"eval_runtime": 1.3015, "eval_samples_per_second": 3.073, "eval_steps_per_second": 3.073, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "31m 58s", "remaining_time": "14m 34s"} +{"loss": 0.11545861, "token_acc": 0.91397168, "grad_norm": 0.27891296, "learning_rate": 2.38e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353786, "epoch": 3.45959596, "global_step/max_steps": "685/990", "percentage": "69.19%", "elapsed_time": "32m 15s", "remaining_time": "14m 21s"} +{"loss": 0.12172416, "token_acc": 0.95246123, "grad_norm": 0.71830851, "learning_rate": 2.31e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353548, "epoch": 3.48484848, "global_step/max_steps": "690/990", "percentage": "69.70%", "elapsed_time": "32m 31s", "remaining_time": "14m 8s"} +{"loss": 0.11445937, "token_acc": 0.96172323, "grad_norm": 0.39089629, "learning_rate": 2.24e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353948, "epoch": 3.51010101, "global_step/max_steps": "695/990", "percentage": "70.20%", "elapsed_time": "32m 43s", "remaining_time": "13m 53s"} +{"loss": 0.03956202, "token_acc": 0.98725275, "grad_norm": 0.22522901, "learning_rate": 2.17e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354517, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "32m 54s", "remaining_time": "13m 37s"} +{"eval_loss": 0.30946803, "eval_token_acc": 0.76771654, "eval_runtime": 1.3167, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "32m 55s", "remaining_time": "13m 38s"} +{"loss": 0.07134842, "token_acc": 0.93287248, "grad_norm": 0.40321338, "learning_rate": 2.102e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354512, "epoch": 3.56060606, "global_step/max_steps": "705/990", "percentage": "71.21%", "elapsed_time": "33m 8s", "remaining_time": "13m 23s"} +{"loss": 0.09319761, 
"token_acc": 0.96474158, "grad_norm": 0.56171137, "learning_rate": 2.034e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354354, "epoch": 3.58585859, "global_step/max_steps": "710/990", "percentage": "71.72%", "elapsed_time": "33m 23s", "remaining_time": "13m 10s"} +{"loss": 0.00758616, "token_acc": 0.99701238, "grad_norm": 0.03697615, "learning_rate": 1.967e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35527, "epoch": 3.61111111, "global_step/max_steps": "715/990", "percentage": "72.22%", "elapsed_time": "33m 32s", "remaining_time": "12m 53s"} +{"loss": 0.05711253, "token_acc": 0.98036782, "grad_norm": 0.63094509, "learning_rate": 1.901e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355088, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "33m 47s", "remaining_time": "12m 40s"} +{"eval_loss": 0.31437123, "eval_token_acc": 0.76476378, "eval_runtime": 1.27, "eval_samples_per_second": 3.15, "eval_steps_per_second": 3.15, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "33m 48s", "remaining_time": "12m 40s"} +{"loss": 0.10505948, "token_acc": 0.93047059, "grad_norm": 0.58260226, "learning_rate": 1.836e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354673, "epoch": 3.66161616, "global_step/max_steps": "725/990", "percentage": "73.23%", "elapsed_time": "34m 3s", "remaining_time": "12m 27s"} +{"loss": 0.11193918, "token_acc": 0.95552943, "grad_norm": 0.31852394, "learning_rate": 1.772e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354365, "epoch": 3.68686869, "global_step/max_steps": "730/990", "percentage": "73.74%", "elapsed_time": "34m 19s", "remaining_time": "12m 13s"} +{"loss": 0.09634423, "token_acc": 0.95890754, "grad_norm": 0.55090433, "learning_rate": 1.709e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354253, "epoch": 3.71212121, "global_step/max_steps": "735/990", "percentage": "74.24%", "elapsed_time": "34m 34s", 
"remaining_time": "11m 59s"} +{"loss": 0.12444987, "token_acc": 0.96185706, "grad_norm": 0.71475935, "learning_rate": 1.646e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354721, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "34m 45s", "remaining_time": "11m 44s"} +{"eval_loss": 0.31311998, "eval_token_acc": 0.76673228, "eval_runtime": 1.2723, "eval_samples_per_second": 3.144, "eval_steps_per_second": 3.144, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "34m 47s", "remaining_time": "11m 45s"} +{"loss": 0.06069818, "token_acc": 0.91298491, "grad_norm": 0.47293511, "learning_rate": 1.585e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354907, "epoch": 3.76262626, "global_step/max_steps": "745/990", "percentage": "75.25%", "elapsed_time": "34m 58s", "remaining_time": "11m 30s"} +{"loss": 0.0420902, "token_acc": 0.98313139, "grad_norm": 0.76170671, "learning_rate": 1.524e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355507, "epoch": 3.78787879, "global_step/max_steps": "750/990", "percentage": "75.76%", "elapsed_time": "35m 9s", "remaining_time": "11m 14s"} +{"loss": 0.18900135, "token_acc": 0.92085258, "grad_norm": 0.46223778, "learning_rate": 1.464e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354408, "epoch": 3.81313131, "global_step/max_steps": "755/990", "percentage": "76.26%", "elapsed_time": "35m 29s", "remaining_time": "11m 2s"} +{"loss": 0.0198194, "token_acc": 0.99390244, "grad_norm": 0.48695704, "learning_rate": 1.406e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355135, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "35m 39s", "remaining_time": "10m 47s"} +{"eval_loss": 0.31260797, "eval_token_acc": 0.76574803, "eval_runtime": 1.248, "eval_samples_per_second": 3.205, "eval_steps_per_second": 3.205, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": 
"76.77%", "elapsed_time": "35m 40s", "remaining_time": "10m 47s"} +{"loss": 0.11093479, "token_acc": 0.92044417, "grad_norm": 0.61361539, "learning_rate": 1.348e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354973, "epoch": 3.86363636, "global_step/max_steps": "765/990", "percentage": "77.27%", "elapsed_time": "35m 54s", "remaining_time": "10m 33s"} +{"loss": 0.06602247, "token_acc": 0.9785434, "grad_norm": 0.24277838, "learning_rate": 1.292e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355116, "epoch": 3.88888889, "global_step/max_steps": "770/990", "percentage": "77.78%", "elapsed_time": "36m 7s", "remaining_time": "10m 19s"} +{"loss": 0.11602372, "token_acc": 0.97355297, "grad_norm": 0.57452285, "learning_rate": 1.236e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35515, "epoch": 3.91414141, "global_step/max_steps": "775/990", "percentage": "78.28%", "elapsed_time": "36m 21s", "remaining_time": "10m 5s"} +{"loss": 0.09153335, "token_acc": 0.95584989, "grad_norm": 0.40915063, "learning_rate": 1.182e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355102, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "36m 36s", "remaining_time": "9m 51s"} +{"eval_loss": 0.31799242, "eval_token_acc": 0.76574803, "eval_runtime": 1.2772, "eval_samples_per_second": 3.132, "eval_steps_per_second": 3.132, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "36m 37s", "remaining_time": "9m 51s"} +{"loss": 0.07547231, "token_acc": 0.92116358, "grad_norm": 1.52997875, "learning_rate": 1.128e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355, "epoch": 3.96464646, "global_step/max_steps": "785/990", "percentage": "79.29%", "elapsed_time": "36m 50s", "remaining_time": "9m 37s"} +{"loss": 0.10159326, "token_acc": 0.96628656, "grad_norm": 0.80330598, "learning_rate": 1.076e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35536, "epoch": 3.98989899, 
"global_step/max_steps": "790/990", "percentage": "79.80%", "elapsed_time": "37m 2s", "remaining_time": "9m 22s"} +{"loss": 0.08443623, "token_acc": 0.97214833, "grad_norm": 0.47934148, "learning_rate": 1.025e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355607, "epoch": 4.01515152, "global_step/max_steps": "795/990", "percentage": "80.30%", "elapsed_time": "37m 15s", "remaining_time": "9m 8s"} +{"loss": 0.03599932, "token_acc": 0.98787152, "grad_norm": 0.34890071, "learning_rate": 9.75e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355646, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "37m 29s", "remaining_time": "8m 54s"} +{"eval_loss": 0.32353964, "eval_token_acc": 0.76476378, "eval_runtime": 1.2641, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "37m 30s", "remaining_time": "8m 54s"} +{"loss": 0.06672016, "token_acc": 0.9496715, "grad_norm": 0.67824471, "learning_rate": 9.26e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355197, "epoch": 4.06565657, "global_step/max_steps": "805/990", "percentage": "81.31%", "elapsed_time": "37m 45s", "remaining_time": "8m 40s"} +{"loss": 0.01867678, "token_acc": 0.99373457, "grad_norm": 0.0298963, "learning_rate": 8.78e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355631, "epoch": 4.09090909, "global_step/max_steps": "810/990", "percentage": "81.82%", "elapsed_time": "37m 57s", "remaining_time": "8m 26s"} +{"loss": 0.04667163, "token_acc": 0.9823115, "grad_norm": 0.47457507, "learning_rate": 8.31e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355942, "epoch": 4.11616162, "global_step/max_steps": "815/990", "percentage": "82.32%", "elapsed_time": "38m 9s", "remaining_time": "8m 11s"} +{"loss": 0.03511995, "token_acc": 0.98811116, "grad_norm": 0.23747724, "learning_rate": 7.86e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 
0.355854, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "38m 23s", "remaining_time": "7m 57s"} +{"eval_loss": 0.32958165, "eval_token_acc": 0.76082677, "eval_runtime": 1.2496, "eval_samples_per_second": 3.201, "eval_steps_per_second": 3.201, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "38m 25s", "remaining_time": "7m 57s"} +{"loss": 0.06080627, "token_acc": 0.95607097, "grad_norm": 0.46623966, "learning_rate": 7.41e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355301, "epoch": 4.16666667, "global_step/max_steps": "825/990", "percentage": "83.33%", "elapsed_time": "38m 41s", "remaining_time": "7m 44s"} +{"loss": 0.0752157, "token_acc": 0.94310469, "grad_norm": 0.31738645, "learning_rate": 6.98e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355358, "epoch": 4.19191919, "global_step/max_steps": "830/990", "percentage": "83.84%", "elapsed_time": "38m 55s", "remaining_time": "7m 30s"} +{"loss": 0.10097865, "token_acc": 0.95639272, "grad_norm": 0.64921135, "learning_rate": 6.56e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355449, "epoch": 4.21717172, "global_step/max_steps": "835/990", "percentage": "84.34%", "elapsed_time": "39m 8s", "remaining_time": "7m 16s"} +{"loss": 0.07692533, "token_acc": 0.96721136, "grad_norm": 0.4551596, "learning_rate": 6.15e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355338, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "39m 23s", "remaining_time": "7m 2s"} +{"eval_loss": 0.33560345, "eval_token_acc": 0.76279528, "eval_runtime": 1.2455, "eval_samples_per_second": 3.211, "eval_steps_per_second": 3.211, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "39m 24s", "remaining_time": "7m 2s"} +{"loss": 0.14583763, "token_acc": 0.92345539, "grad_norm": 0.84573573, "learning_rate": 5.76e-06, "memory(GiB)": 132.94, 
"train_speed(iter/s)": 0.354526, "epoch": 4.26767677, "global_step/max_steps": "845/990", "percentage": "85.35%", "elapsed_time": "39m 43s", "remaining_time": "6m 48s"} +{"loss": 0.10489172, "token_acc": 0.96072294, "grad_norm": 0.2833046, "learning_rate": 5.37e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354312, "epoch": 4.29292929, "global_step/max_steps": "850/990", "percentage": "85.86%", "elapsed_time": "39m 58s", "remaining_time": "6m 35s"} +{"loss": 0.03666624, "token_acc": 0.98607939, "grad_norm": 0.39615917, "learning_rate": 5e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354264, "epoch": 4.31818182, "global_step/max_steps": "855/990", "percentage": "86.36%", "elapsed_time": "40m 13s", "remaining_time": "6m 21s"} +{"loss": 0.03459436, "token_acc": 0.98748835, "grad_norm": 0.20940015, "learning_rate": 4.65e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354279, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "40m 27s", "remaining_time": "6m 6s"} +{"eval_loss": 0.33478868, "eval_token_acc": 0.76082677, "eval_runtime": 1.2407, "eval_samples_per_second": 3.224, "eval_steps_per_second": 3.224, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "40m 28s", "remaining_time": "6m 7s"} +{"loss": 0.04259861, "token_acc": 0.95549079, "grad_norm": 0.45534173, "learning_rate": 4.3e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354035, "epoch": 4.36868687, "global_step/max_steps": "865/990", "percentage": "87.37%", "elapsed_time": "40m 42s", "remaining_time": "5m 53s"} +{"loss": 0.04259796, "token_acc": 0.98844205, "grad_norm": 0.62042683, "learning_rate": 3.97e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354313, "epoch": 4.39393939, "global_step/max_steps": "870/990", "percentage": "87.88%", "elapsed_time": "40m 55s", "remaining_time": "5m 38s"} +{"loss": 0.02417032, "token_acc": 0.98577929, "grad_norm": 0.4031814, "learning_rate": 3.65e-06, 
"memory(GiB)": 132.94, "train_speed(iter/s)": 0.354708, "epoch": 4.41919192, "global_step/max_steps": "875/990", "percentage": "88.38%", "elapsed_time": "41m 6s", "remaining_time": "5m 24s"} +{"loss": 0.05446075, "token_acc": 0.97495154, "grad_norm": 0.62169915, "learning_rate": 3.34e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354846, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "41m 19s", "remaining_time": "5m 9s"} +{"eval_loss": 0.33430994, "eval_token_acc": 0.76082677, "eval_runtime": 1.2375, "eval_samples_per_second": 3.232, "eval_steps_per_second": 3.232, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "41m 20s", "remaining_time": "5m 10s"} +{"loss": 0.03617172, "token_acc": 0.95993991, "grad_norm": 0.32584879, "learning_rate": 3.05e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354631, "epoch": 4.46969697, "global_step/max_steps": "885/990", "percentage": "89.39%", "elapsed_time": "41m 35s", "remaining_time": "4m 56s"} +{"loss": 0.0556745, "token_acc": 0.98252743, "grad_norm": 0.31160802, "learning_rate": 2.77e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354558, "epoch": 4.49494949, "global_step/max_steps": "890/990", "percentage": "89.90%", "elapsed_time": "41m 49s", "remaining_time": "4m 42s"} +{"loss": 0.02590339, "token_acc": 0.98566455, "grad_norm": 0.45844111, "learning_rate": 2.5e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354943, "epoch": 4.52020202, "global_step/max_steps": "895/990", "percentage": "90.40%", "elapsed_time": "42m 1s", "remaining_time": "4m 27s"} +{"loss": 0.11551049, "token_acc": 0.95717965, "grad_norm": 0.60481924, "learning_rate": 2.24e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354798, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "42m 16s", "remaining_time": "4m 13s"} +{"eval_loss": 0.33555707, "eval_token_acc": 0.76181102, "eval_runtime": 
1.2587, "eval_samples_per_second": 3.178, "eval_steps_per_second": 3.178, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "42m 17s", "remaining_time": "4m 13s"} +{"loss": 0.00914606, "token_acc": 0.96008077, "grad_norm": 0.09098434, "learning_rate": 2e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354755, "epoch": 4.57070707, "global_step/max_steps": "905/990", "percentage": "91.41%", "elapsed_time": "42m 30s", "remaining_time": "3m 59s"} +{"loss": 0.01627263, "token_acc": 0.99542567, "grad_norm": 0.12259011, "learning_rate": 1.78e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355345, "epoch": 4.5959596, "global_step/max_steps": "910/990", "percentage": "91.92%", "elapsed_time": "42m 40s", "remaining_time": "3m 45s"} +{"loss": 0.07969476, "token_acc": 0.97265684, "grad_norm": 0.034348, "learning_rate": 1.56e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355547, "epoch": 4.62121212, "global_step/max_steps": "915/990", "percentage": "92.42%", "elapsed_time": "42m 53s", "remaining_time": "3m 30s"} +{"loss": 0.02643486, "token_acc": 0.98862944, "grad_norm": 0.23276809, "learning_rate": 1.36e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355964, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "43m 4s", "remaining_time": "3m 16s"} +{"eval_loss": 0.33159316, "eval_token_acc": 0.76279528, "eval_runtime": 1.2435, "eval_samples_per_second": 3.217, "eval_steps_per_second": 3.217, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "43m 5s", "remaining_time": "3m 16s"} +{"loss": 0.0149626, "token_acc": 0.95634162, "grad_norm": 0.40637901, "learning_rate": 1.18e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355848, "epoch": 4.67171717, "global_step/max_steps": "925/990", "percentage": "93.43%", "elapsed_time": "43m 19s", "remaining_time": "3m 2s"} +{"loss": 0.0939032, "token_acc": 0.95528771, "grad_norm": 
0.04171985, "learning_rate": 1e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.356157, "epoch": 4.6969697, "global_step/max_steps": "930/990", "percentage": "93.94%", "elapsed_time": "43m 30s", "remaining_time": "2m 48s"} +{"loss": 0.0935572, "token_acc": 0.9625612, "grad_norm": 0.45875418, "learning_rate": 8.4e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35577, "epoch": 4.72222222, "global_step/max_steps": "935/990", "percentage": "94.44%", "elapsed_time": "43m 47s", "remaining_time": "2m 34s"} +{"loss": 0.04375721, "token_acc": 0.97534798, "grad_norm": 0.06517396, "learning_rate": 7e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355994, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "44m 0s", "remaining_time": "2m 20s"} +{"eval_loss": 0.33330181, "eval_token_acc": 0.76377953, "eval_runtime": 1.2677, "eval_samples_per_second": 3.155, "eval_steps_per_second": 3.155, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "44m 1s", "remaining_time": "2m 20s"} +{"loss": 0.0398066, "token_acc": 0.96198177, "grad_norm": 0.2442023, "learning_rate": 5.6e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355016, "epoch": 4.77272727, "global_step/max_steps": "945/990", "percentage": "95.45%", "elapsed_time": "44m 21s", "remaining_time": "2m 6s"} +{"loss": 0.0392138, "token_acc": 0.97363083, "grad_norm": 0.03199711, "learning_rate": 4.5e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354979, "epoch": 4.7979798, "global_step/max_steps": "950/990", "percentage": "95.96%", "elapsed_time": "44m 35s", "remaining_time": "1m 52s"} +{"loss": 0.0167478, "token_acc": 0.9951303, "grad_norm": 0.41990209, "learning_rate": 3.4e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355094, "epoch": 4.82323232, "global_step/max_steps": "955/990", "percentage": "96.46%", "elapsed_time": "44m 49s", "remaining_time": "1m 38s"} +{"loss": 0.08067427, "token_acc": 0.96310298, 
"grad_norm": 0.56480831, "learning_rate": 2.5e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355274, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "45m 1s", "remaining_time": "1m 24s"} +{"eval_loss": 0.33460578, "eval_token_acc": 0.75885827, "eval_runtime": 1.2582, "eval_samples_per_second": 3.179, "eval_steps_per_second": 3.179, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "45m 3s", "remaining_time": "1m 24s"} +{"loss": 0.04347405, "token_acc": 0.94731232, "grad_norm": 0.40066797, "learning_rate": 1.7e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35506, "epoch": 4.87373737, "global_step/max_steps": "965/990", "percentage": "97.47%", "elapsed_time": "45m 17s", "remaining_time": "1m 10s"} +{"loss": 0.04453699, "token_acc": 0.98678341, "grad_norm": 0.48766336, "learning_rate": 1.1e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355439, "epoch": 4.8989899, "global_step/max_steps": "970/990", "percentage": "97.98%", "elapsed_time": "45m 28s", "remaining_time": "56s"} +{"loss": 0.02828829, "token_acc": 0.99342916, "grad_norm": 0.2543768, "learning_rate": 6e-08, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355931, "epoch": 4.92424242, "global_step/max_steps": "975/990", "percentage": "98.48%", "elapsed_time": "45m 38s", "remaining_time": "42s"} +{"loss": 0.07211888, "token_acc": 0.96493092, "grad_norm": 0.14917304, "learning_rate": 3e-08, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355974, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "45m 52s", "remaining_time": "28s"} +{"eval_loss": 0.3347429, "eval_token_acc": 0.75984252, "eval_runtime": 1.245, "eval_samples_per_second": 3.213, "eval_steps_per_second": 3.213, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "45m 53s", "remaining_time": "28s"} +{"loss": 0.14155207, "token_acc": 0.92223267, 
"grad_norm": 0.32788646, "learning_rate": 1e-08, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355457, "epoch": 4.97474747, "global_step/max_steps": "985/990", "percentage": "99.49%", "elapsed_time": "46m 10s", "remaining_time": "14s"} +{"loss": 0.01627692, "token_acc": 0.99471069, "grad_norm": 0.20631926, "learning_rate": 0.0, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355697, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 22s", "remaining_time": "0s"} +{"eval_loss": 0.33335516, "eval_token_acc": 0.76181102, "eval_runtime": 1.2395, "eval_samples_per_second": 3.227, "eval_steps_per_second": 3.227, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 24s", "remaining_time": "0s"} +{"train_runtime": 2786.0397, "train_samples_per_second": 0.711, "train_steps_per_second": 0.355, "total_flos": 2.99483427739392e+17, "train_loss": 0.23823543, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 26s", "remaining_time": "0s"} +{"train_dataset": "784.851010±638.096273, min=60.000000, max=4149.000000, size=396", "val_dataset": "325.750000±308.768825, min=104.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 32898.0941M Params (134.2177M Trainable [0.4080%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-990", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/checkpoint-600", "best_metric": 0.30427584, "global_step": 990, "log_history": [{"loss": 0.594012975692749, "token_acc": 0.8394495412844036, "grad_norm": 0.15279237926006317, "learning_rate": 2.0000000000000003e-06, "memory(GiB)": 71.9, "train_speed(iter/s)": 0.211078, "epoch": 0.005050505050505051, "step": 1}, {"loss": 
0.7095059156417847, "token_acc": 0.829162656400385, "grad_norm": 0.3137172758579254, "learning_rate": 1e-05, "memory(GiB)": 81.34, "train_speed(iter/s)": 0.342487, "epoch": 0.025252525252525252, "step": 5}, {"loss": 0.7598193645477295, "token_acc": 0.7958339958657974, "grad_norm": 0.1773456335067749, "learning_rate": 2e-05, "memory(GiB)": 87.85, "train_speed(iter/s)": 0.368643, "epoch": 0.050505050505050504, "step": 10}, {"loss": 0.7279319763183594, "token_acc": 0.8007322175732218, "grad_norm": 0.19639591872692108, "learning_rate": 3e-05, "memory(GiB)": 98.3, "train_speed(iter/s)": 0.370534, "epoch": 0.07575757575757576, "step": 15}, {"loss": 1.0443785667419434, "token_acc": 0.8557748113755078, "grad_norm": 2.970768928527832, "learning_rate": 4e-05, "memory(GiB)": 98.3, "train_speed(iter/s)": 0.390742, "epoch": 0.10101010101010101, "step": 20}, {"eval_loss": 1.6980665922164917, "eval_token_acc": 0.7106299212598425, "eval_runtime": 1.2677, "eval_samples_per_second": 3.155, "eval_steps_per_second": 3.155, "epoch": 0.10101010101010101, "step": 20}, {"loss": 0.5366989612579346, "token_acc": 0.8295410346168806, "grad_norm": 0.24524027109146118, "learning_rate": 5e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.358309, "epoch": 0.12626262626262627, "step": 25}, {"loss": 0.6157921314239502, "token_acc": 0.8143241564893396, "grad_norm": 0.46116694808006287, "learning_rate": 6e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.372063, "epoch": 0.15151515151515152, "step": 30}, {"loss": 0.3712780952453613, "token_acc": 0.8573959255978743, "grad_norm": 0.15954609215259552, "learning_rate": 7e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.380317, "epoch": 0.17676767676767677, "step": 35}, {"loss": 0.44252305030822753, "token_acc": 0.8448576409064498, "grad_norm": 0.11871866136789322, "learning_rate": 8e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.382931, "epoch": 0.20202020202020202, "step": 40}, {"eval_loss": 0.6573391556739807, "eval_token_acc": 
0.7431102362204725, "eval_runtime": 1.307, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "epoch": 0.20202020202020202, "step": 40}, {"loss": 0.45000429153442384, "token_acc": 0.8373225152129817, "grad_norm": 0.21376259624958038, "learning_rate": 9e-05, "memory(GiB)": 108.29, "train_speed(iter/s)": 0.369324, "epoch": 0.22727272727272727, "step": 45}, {"loss": 0.5050764560699463, "token_acc": 0.8796895213454075, "grad_norm": 0.1434803009033203, "learning_rate": 0.0001, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.359356, "epoch": 0.25252525252525254, "step": 50}, {"loss": 0.45570597648620603, "token_acc": 0.8511966701352758, "grad_norm": 0.19765082001686096, "learning_rate": 9.999301905929286e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.35956, "epoch": 0.2777777777777778, "step": 55}, {"loss": 0.3813853025436401, "token_acc": 0.8593534125449019, "grad_norm": 0.19557587802410126, "learning_rate": 9.997207818651274e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.367382, "epoch": 0.30303030303030304, "step": 60}, {"eval_loss": 0.5886135101318359, "eval_token_acc": 0.7509842519685039, "eval_runtime": 1.2814, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "epoch": 0.30303030303030304, "step": 60}, {"loss": 0.550228500366211, "token_acc": 0.826555830150528, "grad_norm": 0.2849789559841156, "learning_rate": 9.99371832291393e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.36018, "epoch": 0.3282828282828283, "step": 65}, {"loss": 0.41207499504089357, "token_acc": 0.8746200607902735, "grad_norm": 0.1136852502822876, "learning_rate": 9.988834393115767e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.357839, "epoch": 0.35353535353535354, "step": 70}, {"loss": 0.5810668468475342, "token_acc": 0.8287886733088621, "grad_norm": 0.17141863703727722, "learning_rate": 9.982557393033758e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.364223, "epoch": 0.3787878787878788, "step": 75}, {"loss": 0.6472435474395752, 
"token_acc": 0.841726618705036, "grad_norm": 0.170815110206604, "learning_rate": 9.974889075442521e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.366011, "epoch": 0.40404040404040403, "step": 80}, {"eval_loss": 0.5714729428291321, "eval_token_acc": 0.7509842519685039, "eval_runtime": 1.2893, "eval_samples_per_second": 3.103, "eval_steps_per_second": 3.103, "epoch": 0.40404040404040403, "step": 80}, {"loss": 0.5299827575683593, "token_acc": 0.8189440290052119, "grad_norm": 0.1489511877298355, "learning_rate": 9.965831581624871e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.361053, "epoch": 0.4292929292929293, "step": 85}, {"loss": 0.4386009693145752, "token_acc": 0.8471794871794872, "grad_norm": 0.33635759353637695, "learning_rate": 9.9553874407739e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.363532, "epoch": 0.45454545454545453, "step": 90}, {"loss": 0.3524549722671509, "token_acc": 0.8693969284554875, "grad_norm": 0.13267917931079865, "learning_rate": 9.94355956928673e-05, "memory(GiB)": 118.38, "train_speed(iter/s)": 0.362313, "epoch": 0.4797979797979798, "step": 95}, {"loss": 0.4188431739807129, "token_acc": 0.8678135405105438, "grad_norm": 0.1372370421886444, "learning_rate": 9.930351269950143e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35826, "epoch": 0.5050505050505051, "step": 100}, {"eval_loss": 0.565014660358429, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.3087, "eval_samples_per_second": 3.057, "eval_steps_per_second": 3.057, "epoch": 0.5050505050505051, "step": 100}, {"loss": 0.5471820831298828, "token_acc": 0.8052507836990596, "grad_norm": 0.18173210322856903, "learning_rate": 9.915766231018318e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356931, "epoch": 0.5303030303030303, "step": 105}, {"loss": 0.45682454109191895, "token_acc": 0.8191489361702128, "grad_norm": 0.5932771563529968, "learning_rate": 9.899808525182935e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.362578, "epoch": 0.5555555555555556, 
"step": 110}, {"loss": 0.44985551834106446, "token_acc": 0.8674093690073966, "grad_norm": 0.10078100115060806, "learning_rate": 9.882482608435923e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357889, "epoch": 0.5808080808080808, "step": 115}, {"loss": 0.5030841827392578, "token_acc": 0.8129452223041022, "grad_norm": 0.299698144197464, "learning_rate": 9.863793318825186e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357367, "epoch": 0.6060606060606061, "step": 120}, {"eval_loss": 0.5346428155899048, "eval_token_acc": 0.7608267716535433, "eval_runtime": 1.2611, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "epoch": 0.6060606060606061, "step": 120}, {"loss": 0.4732785701751709, "token_acc": 0.8376591971626632, "grad_norm": 0.20031973719596863, "learning_rate": 9.843745875103627e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356721, "epoch": 0.6313131313131313, "step": 125}, {"loss": 0.48586230278015136, "token_acc": 0.8409764190069914, "grad_norm": 0.21032185852527618, "learning_rate": 9.822345875271883e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.3569, "epoch": 0.6565656565656566, "step": 130}, {"loss": 0.362222957611084, "token_acc": 0.8709891275523733, "grad_norm": 0.10959025472402573, "learning_rate": 9.799599295015154e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356829, "epoch": 0.6818181818181818, "step": 135}, {"loss": 0.5328386306762696, "token_acc": 0.8266873144921926, "grad_norm": 0.20188008248806, "learning_rate": 9.775512486034563e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35701, "epoch": 0.7070707070707071, "step": 140}, {"eval_loss": 0.49480319023132324, "eval_token_acc": 0.7519685039370079, "eval_runtime": 1.2596, "eval_samples_per_second": 3.176, "eval_steps_per_second": 3.176, "epoch": 0.7070707070707071, "step": 140}, {"loss": 0.3609034061431885, "token_acc": 0.856077862911576, "grad_norm": 0.21190020442008972, "learning_rate": 9.750092174273521e-05, "memory(GiB)": 132.92, 
"train_speed(iter/s)": 0.353392, "epoch": 0.7323232323232324, "step": 145}, {"loss": 0.37536747455596925, "token_acc": 0.8812260536398467, "grad_norm": 0.42249825596809387, "learning_rate": 9.723345458039594e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354546, "epoch": 0.7575757575757576, "step": 150}, {"loss": 0.41838736534118653, "token_acc": 0.857667360176233, "grad_norm": 0.2383793443441391, "learning_rate": 9.69527980602239e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354581, "epoch": 0.7828282828282829, "step": 155}, {"loss": 0.3394253969192505, "token_acc": 0.8810381038103811, "grad_norm": 0.20167972147464752, "learning_rate": 9.665903055208014e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356298, "epoch": 0.8080808080808081, "step": 160}, {"eval_loss": 0.48000994324684143, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.2638, "eval_samples_per_second": 3.165, "eval_steps_per_second": 3.165, "epoch": 0.8080808080808081, "step": 160}, {"loss": 0.41873645782470703, "token_acc": 0.8491237317169588, "grad_norm": 0.1416829228401184, "learning_rate": 9.635223408690688e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.354225, "epoch": 0.8333333333333334, "step": 165}, {"loss": 0.4564688205718994, "token_acc": 0.8519195612431444, "grad_norm": 0.20265255868434906, "learning_rate": 9.603249433382144e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.35386, "epoch": 0.8585858585858586, "step": 170}, {"loss": 0.41248092651367185, "token_acc": 0.8566410170625627, "grad_norm": 0.282372385263443, "learning_rate": 9.569990057619414e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.355591, "epoch": 0.8838383838383839, "step": 175}, {"loss": 0.4143404006958008, "token_acc": 0.8632127625967462, "grad_norm": 0.1956530064344406, "learning_rate": 9.535454568671704e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.356508, "epoch": 0.9090909090909091, "step": 180}, {"eval_loss": 0.4819040894508362, "eval_token_acc": 0.7637795275590551, 
"eval_runtime": 1.2641, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 0.9090909090909091, "step": 180}, {"loss": 0.4956723690032959, "token_acc": 0.8175658720200752, "grad_norm": 0.25014156103134155, "learning_rate": 9.49965261014704e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.355286, "epoch": 0.9343434343434344, "step": 185}, {"loss": 0.5832932949066162, "token_acc": 0.8024002232765839, "grad_norm": 3.0596258640289307, "learning_rate": 9.462594179299406e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.357668, "epoch": 0.9595959595959596, "step": 190}, {"loss": 0.559151029586792, "token_acc": 0.8146739738284309, "grad_norm": 0.1552937924861908, "learning_rate": 9.424289624237144e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.3574, "epoch": 0.9848484848484849, "step": 195}, {"loss": 0.5025578498840332, "token_acc": 0.8483137494277431, "grad_norm": 0.15638215839862823, "learning_rate": 9.384749641033359e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.353969, "epoch": 1.0101010101010102, "step": 200}, {"eval_loss": 0.4734075665473938, "eval_token_acc": 0.7667322834645669, "eval_runtime": 1.2465, "eval_samples_per_second": 3.209, "eval_steps_per_second": 3.209, "epoch": 1.0101010101010102, "step": 200}, {"loss": 0.45037288665771485, "token_acc": 0.835399107585523, "grad_norm": 0.16425618529319763, "learning_rate": 9.343985270739182e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350091, "epoch": 1.0353535353535352, "step": 205}, {"loss": 0.42234115600585936, "token_acc": 0.8614671060661541, "grad_norm": 0.21121171116828918, "learning_rate": 9.302007896300698e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350171, "epoch": 1.0606060606060606, "step": 210}, {"loss": 0.3796833992004395, "token_acc": 0.8746039856923863, "grad_norm": 0.18083210289478302, "learning_rate": 9.25882923938038e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.349555, "epoch": 1.0858585858585859, "step": 215}, {"loss": 
0.3580619812011719, "token_acc": 0.8796054540179866, "grad_norm": 0.30925488471984863, "learning_rate": 9.214461357083985e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350637, "epoch": 1.1111111111111112, "step": 220}, {"eval_loss": 0.4918195605278015, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2455, "eval_samples_per_second": 3.212, "eval_steps_per_second": 3.212, "epoch": 1.1111111111111112, "step": 220}, {"loss": 0.4854443550109863, "token_acc": 0.8444093422091843, "grad_norm": 0.36294564604759216, "learning_rate": 9.168916638593736e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.349163, "epoch": 1.1363636363636362, "step": 225}, {"loss": 0.39215381145477296, "token_acc": 0.8688397695020211, "grad_norm": 0.2092873752117157, "learning_rate": 9.122207801708802e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.347175, "epoch": 1.1616161616161615, "step": 230}, {"loss": 0.20322649478912352, "token_acc": 0.9091831557584982, "grad_norm": 0.24130743741989136, "learning_rate": 9.074347889294016e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.349366, "epoch": 1.1868686868686869, "step": 235}, {"loss": 0.44439196586608887, "token_acc": 0.8551265412070085, "grad_norm": 0.4042667746543884, "learning_rate": 9.025350265637815e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350626, "epoch": 1.2121212121212122, "step": 240}, {"eval_loss": 0.5139080882072449, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2422, "eval_samples_per_second": 3.22, "eval_steps_per_second": 3.22, "epoch": 1.2121212121212122, "step": 240}, {"loss": 0.29034128189086916, "token_acc": 0.8568427855873324, "grad_norm": 0.2574117183685303, "learning_rate": 8.975228612720416e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.350706, "epoch": 1.2373737373737375, "step": 245}, {"loss": 0.4220092296600342, "token_acc": 0.8524024422617468, "grad_norm": 0.49813297390937805, "learning_rate": 8.923996926393305e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 
0.352825, "epoch": 1.2626262626262625, "step": 250}, {"loss": 0.37980899810791013, "token_acc": 0.8598113725849281, "grad_norm": 0.2515278458595276, "learning_rate": 8.871669512471068e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.351391, "epoch": 1.2878787878787878, "step": 255}, {"loss": 0.38159129619598386, "token_acc": 0.8543320676561961, "grad_norm": 0.43190768361091614, "learning_rate": 8.818260982736661e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.352512, "epoch": 1.3131313131313131, "step": 260}, {"eval_loss": 0.4426303505897522, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2477, "eval_samples_per_second": 3.206, "eval_steps_per_second": 3.206, "epoch": 1.3131313131313131, "step": 260}, {"loss": 0.30707058906555174, "token_acc": 0.8950704812745016, "grad_norm": 0.12871959805488586, "learning_rate": 8.763786250861256e-05, "memory(GiB)": 132.92, "train_speed(iter/s)": 0.348922, "epoch": 1.3383838383838385, "step": 265}, {"loss": 0.2806516170501709, "token_acc": 0.8953946242081835, "grad_norm": 0.2897852659225464, "learning_rate": 8.708260528239788e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350022, "epoch": 1.3636363636363638, "step": 270}, {"loss": 0.2911843299865723, "token_acc": 0.8955285818030916, "grad_norm": 0.2160148024559021, "learning_rate": 8.651699319743347e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.349742, "epoch": 1.3888888888888888, "step": 275}, {"loss": 0.41789636611938474, "token_acc": 0.8685470085470085, "grad_norm": 0.5827478170394897, "learning_rate": 8.594118419389647e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350778, "epoch": 1.4141414141414141, "step": 280}, {"eval_loss": 0.42635607719421387, "eval_token_acc": 0.7736220472440944, "eval_runtime": 1.244, "eval_samples_per_second": 3.215, "eval_steps_per_second": 3.215, "epoch": 1.4141414141414141, "step": 280}, {"loss": 0.23049118518829345, "token_acc": 0.8896126157010628, "grad_norm": 0.5320525765419006, "learning_rate": 
8.535533905932738e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.350607, "epoch": 1.4393939393939394, "step": 285}, {"loss": 0.36380805969238283, "token_acc": 0.8619923216811477, "grad_norm": 0.4473183751106262, "learning_rate": 8.475962138373213e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352023, "epoch": 1.4646464646464645, "step": 290}, {"loss": 0.395569372177124, "token_acc": 0.8510589842860397, "grad_norm": 0.30866506695747375, "learning_rate": 8.415419751390155e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353538, "epoch": 1.4898989898989898, "step": 295}, {"loss": 0.32523369789123535, "token_acc": 0.8855692530819435, "grad_norm": 1.6120028495788574, "learning_rate": 8.353923650696118e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353789, "epoch": 1.5151515151515151, "step": 300}, {"eval_loss": 0.45433884859085083, "eval_token_acc": 0.7775590551181102, "eval_runtime": 1.2577, "eval_samples_per_second": 3.18, "eval_steps_per_second": 3.18, "epoch": 1.5151515151515151, "step": 300}, {"loss": 0.3657505035400391, "token_acc": 0.8681001582425766, "grad_norm": 0.3755347430706024, "learning_rate": 8.291491008316409e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.351379, "epoch": 1.5404040404040404, "step": 305}, {"loss": 0.30150370597839354, "token_acc": 0.900846170535908, "grad_norm": 0.33566632866859436, "learning_rate": 8.228139257794012e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352623, "epoch": 1.5656565656565657, "step": 310}, {"loss": 0.3159091234207153, "token_acc": 0.8779527559055118, "grad_norm": 0.8437044024467468, "learning_rate": 8.163886089321493e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353918, "epoch": 1.5909090909090908, "step": 315}, {"loss": 0.33113200664520265, "token_acc": 0.9020088943413587, "grad_norm": 0.3801237642765045, "learning_rate": 8.098749444801224e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354333, "epoch": 1.6161616161616161, "step": 320}, {"eval_loss": 
0.43398547172546387, "eval_token_acc": 0.7726377952755905, "eval_runtime": 1.2796, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "epoch": 1.6161616161616161, "step": 320}, {"loss": 0.34933011531829833, "token_acc": 0.8527973927213471, "grad_norm": 0.37100571393966675, "learning_rate": 8.032747512835337e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353766, "epoch": 1.6414141414141414, "step": 325}, {"loss": 0.37286627292633057, "token_acc": 0.8843691926491843, "grad_norm": 0.580182671546936, "learning_rate": 7.965898723646776e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354876, "epoch": 1.6666666666666665, "step": 330}, {"loss": 0.3794433116912842, "token_acc": 0.8727474355420016, "grad_norm": 0.3128257989883423, "learning_rate": 7.898221743932888e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354891, "epoch": 1.691919191919192, "step": 335}, {"loss": 0.262727689743042, "token_acc": 0.9075043630017452, "grad_norm": 0.8151857852935791, "learning_rate": 7.829735471652978e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355958, "epoch": 1.7171717171717171, "step": 340}, {"eval_loss": 0.37076544761657715, "eval_token_acc": 0.7755905511811023, "eval_runtime": 1.2578, "eval_samples_per_second": 3.18, "eval_steps_per_second": 3.18, "epoch": 1.7171717171717171, "step": 340}, {"loss": 0.28121564388275144, "token_acc": 0.8951330717845404, "grad_norm": 0.32429400086402893, "learning_rate": 7.760459030751284e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354629, "epoch": 1.7424242424242424, "step": 345}, {"loss": 0.20163230895996093, "token_acc": 0.920251572327044, "grad_norm": 1.0614266395568848, "learning_rate": 7.690411765816864e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355616, "epoch": 1.7676767676767677, "step": 350}, {"loss": 0.4713843822479248, "token_acc": 0.8495345016429354, "grad_norm": 0.626061737537384, "learning_rate": 7.619613236681843e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355891, 
"epoch": 1.7929292929292928, "step": 355}, {"loss": 0.25896482467651366, "token_acc": 0.8964471929186371, "grad_norm": 0.2624683976173401, "learning_rate": 7.548083212959588e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356199, "epoch": 1.8181818181818183, "step": 360}, {"eval_loss": 0.3961385488510132, "eval_token_acc": 0.7765748031496063, "eval_runtime": 1.2624, "eval_samples_per_second": 3.168, "eval_steps_per_second": 3.168, "epoch": 1.8181818181818183, "step": 360}, {"loss": 0.38772385120391845, "token_acc": 0.8543818727090969, "grad_norm": 0.7176984548568726, "learning_rate": 7.475841668524268e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356015, "epoch": 1.8434343434343434, "step": 365}, {"loss": 0.34966278076171875, "token_acc": 0.8757403751233959, "grad_norm": 0.27225521206855774, "learning_rate": 7.402908775933419e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356009, "epoch": 1.8686868686868687, "step": 370}, {"loss": 0.4062873363494873, "token_acc": 0.8658723605048956, "grad_norm": 0.4194507300853729, "learning_rate": 7.329304900794991e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355662, "epoch": 1.893939393939394, "step": 375}, {"loss": 0.352255654335022, "token_acc": 0.8888125343595382, "grad_norm": 0.6045131683349609, "learning_rate": 7.255050596080509e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356078, "epoch": 1.9191919191919191, "step": 380}, {"eval_loss": 0.38526344299316406, "eval_token_acc": 0.7726377952755905, "eval_runtime": 1.2874, "eval_samples_per_second": 3.107, "eval_steps_per_second": 3.107, "epoch": 1.9191919191919191, "step": 380}, {"loss": 0.3758531093597412, "token_acc": 0.8816031376394166, "grad_norm": 0.16466927528381348, "learning_rate": 7.180166596385914e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354701, "epoch": 1.9444444444444444, "step": 385}, {"loss": 0.25691215991973876, "token_acc": 0.8995055766356215, "grad_norm": 0.324630469083786, "learning_rate": 7.104673812141675e-05, 
"memory(GiB)": 132.93, "train_speed(iter/s)": 0.354307, "epoch": 1.9696969696969697, "step": 390}, {"loss": 0.2570985794067383, "token_acc": 0.899527983816588, "grad_norm": 0.3074846863746643, "learning_rate": 7.02859332377382e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35515, "epoch": 1.9949494949494948, "step": 395}, {"loss": 0.22938873767852783, "token_acc": 0.9419516786946972, "grad_norm": 0.5683487057685852, "learning_rate": 6.951946375817474e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356409, "epoch": 2.0202020202020203, "step": 400}, {"eval_loss": 0.3636195659637451, "eval_token_acc": 0.7677165354330708, "eval_runtime": 1.2559, "eval_samples_per_second": 3.185, "eval_steps_per_second": 3.185, "epoch": 2.0202020202020203, "step": 400}, {"loss": 0.15864256620407105, "token_acc": 0.9092146454335103, "grad_norm": 0.6048702001571655, "learning_rate": 6.874754370984606e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355742, "epoch": 2.0454545454545454, "step": 405}, {"loss": 0.3233179092407227, "token_acc": 0.9176615891313298, "grad_norm": 0.7089707851409912, "learning_rate": 6.797038864187564e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355669, "epoch": 2.0707070707070705, "step": 410}, {"loss": 0.19509116411209107, "token_acc": 0.9241970021413276, "grad_norm": 0.370557576417923, "learning_rate": 6.718821556520151e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356398, "epoch": 2.095959595959596, "step": 415}, {"loss": 0.09936256408691406, "token_acc": 0.9714867617107943, "grad_norm": 0.2815419137477875, "learning_rate": 6.640124289197845e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35724, "epoch": 2.121212121212121, "step": 420}, {"eval_loss": 0.37009483575820923, "eval_token_acc": 0.7736220472440944, "eval_runtime": 1.2502, "eval_samples_per_second": 3.199, "eval_steps_per_second": 3.199, "epoch": 2.121212121212121, "step": 420}, {"loss": 0.18864725828170775, "token_acc": 0.8931275480489226, "grad_norm": 
0.6189222931861877, "learning_rate": 6.560969037458933e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357016, "epoch": 2.1464646464646466, "step": 425}, {"loss": 0.13603065013885499, "token_acc": 0.9594075079149706, "grad_norm": 0.4722209870815277, "learning_rate": 6.481377904428171e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356484, "epoch": 2.1717171717171717, "step": 430}, {"loss": 0.1884603261947632, "token_acc": 0.9427539503386004, "grad_norm": 0.25799301266670227, "learning_rate": 6.401373114944781e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355717, "epoch": 2.196969696969697, "step": 435}, {"loss": 0.2138120174407959, "token_acc": 0.9274074074074075, "grad_norm": 0.847876787185669, "learning_rate": 6.320977009356431e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356335, "epoch": 2.2222222222222223, "step": 440}, {"eval_loss": 0.378639817237854, "eval_token_acc": 0.7736220472440944, "eval_runtime": 1.236, "eval_samples_per_second": 3.236, "eval_steps_per_second": 3.236, "epoch": 2.2222222222222223, "step": 440}, {"loss": 0.12016980648040772, "token_acc": 0.9293953606287235, "grad_norm": 0.38243409991264343, "learning_rate": 6.240212037280966e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355702, "epoch": 2.2474747474747474, "step": 445}, {"loss": 0.2691728830337524, "token_acc": 0.905337548819044, "grad_norm": 0.717224657535553, "learning_rate": 6.159100751337642e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356304, "epoch": 2.2727272727272725, "step": 450}, {"loss": 0.18497172594070435, "token_acc": 0.9337628865979382, "grad_norm": 0.24402710795402527, "learning_rate": 6.077665800849568e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356536, "epoch": 2.297979797979798, "step": 455}, {"loss": 0.19234393835067748, "token_acc": 0.9360821581851625, "grad_norm": 0.246830552816391, "learning_rate": 5.99592992551918e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356885, "epoch": 2.323232323232323, "step": 460}, 
{"eval_loss": 0.3614441454410553, "eval_token_acc": 0.7716535433070866, "eval_runtime": 1.2634, "eval_samples_per_second": 3.166, "eval_steps_per_second": 3.166, "epoch": 2.323232323232323, "step": 460}, {"loss": 0.17323193550109864, "token_acc": 0.8919093851132686, "grad_norm": 0.48954588174819946, "learning_rate": 5.913915949078452e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357128, "epoch": 2.3484848484848486, "step": 465}, {"loss": 0.13749444484710693, "token_acc": 0.9434219495569189, "grad_norm": 0.5584018230438232, "learning_rate": 5.831646772915651e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35673, "epoch": 2.3737373737373737, "step": 470}, {"loss": 0.20462331771850586, "token_acc": 0.9227237949502678, "grad_norm": 1.301836371421814, "learning_rate": 5.749145369680407e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357258, "epoch": 2.398989898989899, "step": 475}, {"loss": 0.20529029369354249, "token_acc": 0.9261273320505312, "grad_norm": 0.33143967390060425, "learning_rate": 5.666434776868895e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356138, "epoch": 2.4242424242424243, "step": 480}, {"eval_loss": 0.3671746850013733, "eval_token_acc": 0.7618110236220472, "eval_runtime": 1.2559, "eval_samples_per_second": 3.185, "eval_steps_per_second": 3.185, "epoch": 2.4242424242424243, "step": 480}, {"loss": 0.148415470123291, "token_acc": 0.8970821081203347, "grad_norm": 0.6394232511520386, "learning_rate": 5.583538090390882e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356392, "epoch": 2.4494949494949494, "step": 485}, {"loss": 0.2857876539230347, "token_acc": 0.895648670427075, "grad_norm": 0.5502394437789917, "learning_rate": 5.5004784581204927e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356615, "epoch": 2.474747474747475, "step": 490}, {"loss": 0.15979899168014527, "token_acc": 0.9382183908045977, "grad_norm": 0.2799193263053894, "learning_rate": 5.41727907343245e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 
0.357352, "epoch": 2.5, "step": 495}, {"loss": 0.23016483783721925, "token_acc": 0.8928835262250677, "grad_norm": 0.47879472374916077, "learning_rate": 5.3339631687256084e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357166, "epoch": 2.525252525252525, "step": 500}, {"eval_loss": 0.36765411496162415, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2648, "eval_samples_per_second": 3.162, "eval_steps_per_second": 3.162, "epoch": 2.525252525252525, "step": 500}, {"loss": 0.15926196575164794, "token_acc": 0.90470706779905, "grad_norm": 0.5626565217971802, "learning_rate": 5.250554008935596e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356766, "epoch": 2.5505050505050506, "step": 505}, {"loss": 0.15416876077651978, "token_acc": 0.9385658067337123, "grad_norm": 0.7293064594268799, "learning_rate": 5.167074885038373e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357626, "epoch": 2.5757575757575757, "step": 510}, {"loss": 0.18747940063476562, "token_acc": 0.9271042471042471, "grad_norm": 0.32495784759521484, "learning_rate": 5.0835491075465045e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.358064, "epoch": 2.601010101010101, "step": 515}, {"loss": 0.19285820722579955, "token_acc": 0.9182068423122296, "grad_norm": 0.372670978307724, "learning_rate": 5e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.358116, "epoch": 2.6262626262626263, "step": 520}, {"eval_loss": 0.35743266344070435, "eval_token_acc": 0.7687007874015748, "eval_runtime": 1.2997, "eval_samples_per_second": 3.078, "eval_steps_per_second": 3.078, "epoch": 2.6262626262626263, "step": 520}, {"loss": 0.1654897451400757, "token_acc": 0.9160741885625966, "grad_norm": 0.4129459857940674, "learning_rate": 4.916450892453495e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.357945, "epoch": 2.6515151515151514, "step": 525}, {"loss": 0.22736096382141113, "token_acc": 0.9109231599784056, "grad_norm": 0.3547585904598236, "learning_rate": 4.832925114961629e-05, "memory(GiB)": 132.93, 
"train_speed(iter/s)": 0.357177, "epoch": 2.676767676767677, "step": 530}, {"loss": 0.16493122577667235, "token_acc": 0.9458256432526327, "grad_norm": 0.7628602385520935, "learning_rate": 4.749445991064404e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356557, "epoch": 2.702020202020202, "step": 535}, {"loss": 0.2907134771347046, "token_acc": 0.8969603297269448, "grad_norm": 0.9734154343605042, "learning_rate": 4.666036831274392e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356129, "epoch": 2.7272727272727275, "step": 540}, {"eval_loss": 0.33516690135002136, "eval_token_acc": 0.7716535433070866, "eval_runtime": 1.2565, "eval_samples_per_second": 3.183, "eval_steps_per_second": 3.183, "epoch": 2.7272727272727275, "step": 540}, {"loss": 0.2214029312133789, "token_acc": 0.8890953431657183, "grad_norm": 0.566674530506134, "learning_rate": 4.582720926567552e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355557, "epoch": 2.7525252525252526, "step": 545}, {"loss": 0.17021151781082153, "token_acc": 0.9310242307120559, "grad_norm": 0.42045527696609497, "learning_rate": 4.4995215418795085e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355665, "epoch": 2.7777777777777777, "step": 550}, {"loss": 0.18070143461227417, "token_acc": 0.939800327819997, "grad_norm": 0.6601650714874268, "learning_rate": 4.416461909609119e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356126, "epoch": 2.8030303030303028, "step": 555}, {"loss": 0.15745289325714112, "token_acc": 0.9293805736322005, "grad_norm": 0.25845786929130554, "learning_rate": 4.333565223131107e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.356108, "epoch": 2.8282828282828283, "step": 560}, {"eval_loss": 0.33242106437683105, "eval_token_acc": 0.7696850393700787, "eval_runtime": 1.2648, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.8282828282828283, "step": 560}, {"loss": 0.20625925064086914, "token_acc": 0.9046810317376075, "grad_norm": 0.7115989923477173, 
"learning_rate": 4.250854630319593e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355539, "epoch": 2.8535353535353534, "step": 565}, {"loss": 0.25417945384979246, "token_acc": 0.9046015712682379, "grad_norm": 0.6034452319145203, "learning_rate": 4.1683532270843504e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355222, "epoch": 2.878787878787879, "step": 570}, {"loss": 0.1719497799873352, "token_acc": 0.9340673744920698, "grad_norm": 0.9026182889938354, "learning_rate": 4.0860840509215496e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35529, "epoch": 2.904040404040404, "step": 575}, {"loss": 0.2509638786315918, "token_acc": 0.8932173225232352, "grad_norm": 2.0807785987854004, "learning_rate": 4.0040700744808204e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355904, "epoch": 2.929292929292929, "step": 580}, {"eval_loss": 0.3441176414489746, "eval_token_acc": 0.7696850393700787, "eval_runtime": 1.2647, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.929292929292929, "step": 580}, {"loss": 0.21597733497619628, "token_acc": 0.8890164561806353, "grad_norm": 0.8126916885375977, "learning_rate": 3.922334199150432e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355876, "epoch": 2.9545454545454546, "step": 585}, {"loss": 0.20148119926452637, "token_acc": 0.9332627118644068, "grad_norm": 0.702083945274353, "learning_rate": 3.840899248662358e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355335, "epoch": 2.9797979797979797, "step": 590}, {"loss": 0.2256376028060913, "token_acc": 0.932657200811359, "grad_norm": 0.25534525513648987, "learning_rate": 3.7597879627190334e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354817, "epoch": 3.005050505050505, "step": 595}, {"loss": 0.13529281616210936, "token_acc": 0.9571852479864349, "grad_norm": 0.8379502296447754, "learning_rate": 3.6790229906435705e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355006, "epoch": 3.0303030303030303, "step": 600}, {"eval_loss": 
0.30427584052085876, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2499, "eval_samples_per_second": 3.2, "eval_steps_per_second": 3.2, "epoch": 3.0303030303030303, "step": 600}, {"loss": 0.09742944836616516, "token_acc": 0.9399038461538461, "grad_norm": 0.5740740895271301, "learning_rate": 3.598626885055219e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353865, "epoch": 3.0555555555555554, "step": 605}, {"loss": 0.05046466588973999, "token_acc": 0.9742566354021154, "grad_norm": 0.7276327610015869, "learning_rate": 3.5186220955718306e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354532, "epoch": 3.080808080808081, "step": 610}, {"loss": 0.03918294310569763, "token_acc": 0.9892593421347058, "grad_norm": 0.1619880646467209, "learning_rate": 3.4390309625410686e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355134, "epoch": 3.106060606060606, "step": 615}, {"loss": 0.08534240126609802, "token_acc": 0.9704122340425532, "grad_norm": 0.4289223849773407, "learning_rate": 3.3598757108021546e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355115, "epoch": 3.1313131313131315, "step": 620}, {"eval_loss": 0.3079659342765808, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.2475, "eval_samples_per_second": 3.207, "eval_steps_per_second": 3.207, "epoch": 3.1313131313131315, "step": 620}, {"loss": 0.09771488904953003, "token_acc": 0.9483132767804301, "grad_norm": 0.4063633680343628, "learning_rate": 3.281178443479852e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354176, "epoch": 3.1565656565656566, "step": 625}, {"loss": 0.0691908597946167, "token_acc": 0.9681883908283413, "grad_norm": 0.7491307258605957, "learning_rate": 3.202961135812437e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354714, "epoch": 3.1818181818181817, "step": 630}, {"loss": 0.1324032187461853, "token_acc": 0.9444506165981558, "grad_norm": 0.5784112215042114, "learning_rate": 3.1252456290153954e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354589, 
"epoch": 3.207070707070707, "step": 635}, {"loss": 0.11127980947494506, "token_acc": 0.9494873483209482, "grad_norm": 0.4406881630420685, "learning_rate": 3.0480536241825263e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353946, "epoch": 3.2323232323232323, "step": 640}, {"eval_loss": 0.30791735649108887, "eval_token_acc": 0.7716535433070866, "eval_runtime": 1.2574, "eval_samples_per_second": 3.181, "eval_steps_per_second": 3.181, "epoch": 3.2323232323232323, "step": 640}, {"loss": 0.08342494964599609, "token_acc": 0.9452848128619586, "grad_norm": 0.4303297996520996, "learning_rate": 2.9714066762261823e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.352926, "epoch": 3.257575757575758, "step": 645}, {"loss": 0.1292866587638855, "token_acc": 0.9530223943424819, "grad_norm": 0.2603033781051636, "learning_rate": 2.895326187858326e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353396, "epoch": 3.282828282828283, "step": 650}, {"loss": 0.09613170623779296, "token_acc": 0.9691442468460252, "grad_norm": 0.4966074824333191, "learning_rate": 2.8198334036140874e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353561, "epoch": 3.308080808080808, "step": 655}, {"loss": 0.09578182101249695, "token_acc": 0.9634032634032634, "grad_norm": 0.2868223190307617, "learning_rate": 2.74494940391949e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353616, "epoch": 3.3333333333333335, "step": 660}, {"eval_loss": 0.30913257598876953, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.251, "eval_samples_per_second": 3.197, "eval_steps_per_second": 3.197, "epoch": 3.3333333333333335, "step": 660}, {"loss": 0.09090739488601685, "token_acc": 0.9261056167195372, "grad_norm": 0.8172745108604431, "learning_rate": 2.6706950992050094e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353647, "epoch": 3.3585858585858586, "step": 665}, {"loss": 0.10645673274993897, "token_acc": 0.9591964846202135, "grad_norm": 0.8159921169281006, "learning_rate": 2.5970912240665813e-05, 
"memory(GiB)": 132.93, "train_speed(iter/s)": 0.353871, "epoch": 3.3838383838383836, "step": 670}, {"loss": 0.07736409306526185, "token_acc": 0.9623198471956937, "grad_norm": 0.35854482650756836, "learning_rate": 2.5241583314757327e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354188, "epoch": 3.409090909090909, "step": 675}, {"loss": 0.12336930036544799, "token_acc": 0.9413841807909604, "grad_norm": 0.6301116943359375, "learning_rate": 2.4519167870404125e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354564, "epoch": 3.4343434343434343, "step": 680}, {"eval_loss": 0.3097744286060333, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.3015, "eval_samples_per_second": 3.073, "eval_steps_per_second": 3.073, "epoch": 3.4343434343434343, "step": 680}, {"loss": 0.11545860767364502, "token_acc": 0.9139716761783978, "grad_norm": 0.2789129614830017, "learning_rate": 2.3803867633181574e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353786, "epoch": 3.45959595959596, "step": 685}, {"loss": 0.1217241644859314, "token_acc": 0.9524612272420768, "grad_norm": 0.7183085083961487, "learning_rate": 2.3095882341831372e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353548, "epoch": 3.484848484848485, "step": 690}, {"loss": 0.11445937156677247, "token_acc": 0.9617232295056563, "grad_norm": 0.3908962905406952, "learning_rate": 2.2395409692487175e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.353948, "epoch": 3.51010101010101, "step": 695}, {"loss": 0.0395620197057724, "token_acc": 0.9872527472527473, "grad_norm": 0.22522900998592377, "learning_rate": 2.1702645283470236e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354517, "epoch": 3.5353535353535355, "step": 700}, {"eval_loss": 0.30946803092956543, "eval_token_acc": 0.7677165354330708, "eval_runtime": 1.3167, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.5353535353535355, "step": 700}, {"loss": 0.07134841680526734, "token_acc": 0.9328724758959432, "grad_norm": 
0.40321338176727295, "learning_rate": 2.1017782560671123e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354512, "epoch": 3.5606060606060606, "step": 705}, {"loss": 0.09319761395454407, "token_acc": 0.9647415777359281, "grad_norm": 0.5617113709449768, "learning_rate": 2.0341012763532243e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354354, "epoch": 3.5858585858585856, "step": 710}, {"loss": 0.0075861550867557526, "token_acc": 0.9970123772940674, "grad_norm": 0.036976154893636703, "learning_rate": 1.967252487164663e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.35527, "epoch": 3.611111111111111, "step": 715}, {"loss": 0.0571125328540802, "token_acc": 0.9803678212794765, "grad_norm": 0.630945086479187, "learning_rate": 1.9012505551987765e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.355088, "epoch": 3.6363636363636362, "step": 720}, {"eval_loss": 0.3143712282180786, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.27, "eval_samples_per_second": 3.15, "eval_steps_per_second": 3.15, "epoch": 3.6363636363636362, "step": 720}, {"loss": 0.10505948066711426, "token_acc": 0.9304705882352942, "grad_norm": 0.5826022624969482, "learning_rate": 1.836113910678507e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354673, "epoch": 3.6616161616161618, "step": 725}, {"loss": 0.11193917989730835, "token_acc": 0.9555294348124204, "grad_norm": 0.31852394342422485, "learning_rate": 1.771860742205988e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354365, "epoch": 3.686868686868687, "step": 730}, {"loss": 0.0963442325592041, "token_acc": 0.9589075419694312, "grad_norm": 0.5509043335914612, "learning_rate": 1.7085089916835923e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354253, "epoch": 3.712121212121212, "step": 735}, {"loss": 0.12444987297058105, "token_acc": 0.9618570602966673, "grad_norm": 0.714759349822998, "learning_rate": 1.646076349303884e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354721, "epoch": 3.7373737373737375, "step": 
740}, {"eval_loss": 0.3131199777126312, "eval_token_acc": 0.7667322834645669, "eval_runtime": 1.2723, "eval_samples_per_second": 3.144, "eval_steps_per_second": 3.144, "epoch": 3.7373737373737375, "step": 740}, {"loss": 0.06069818139076233, "token_acc": 0.9129849137931034, "grad_norm": 0.47293511033058167, "learning_rate": 1.584580248609846e-05, "memory(GiB)": 132.93, "train_speed(iter/s)": 0.354907, "epoch": 3.7626262626262625, "step": 745}, {"loss": 0.042090201377868654, "token_acc": 0.9831313851271086, "grad_norm": 0.7617067098617554, "learning_rate": 1.5240378616267886e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355507, "epoch": 3.787878787878788, "step": 750}, {"loss": 0.18900134563446044, "token_acc": 0.9208525754884547, "grad_norm": 0.46223777532577515, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354408, "epoch": 3.813131313131313, "step": 755}, {"loss": 0.01981939971446991, "token_acc": 0.9939024390243902, "grad_norm": 0.48695704340934753, "learning_rate": 1.4058815806103542e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355135, "epoch": 3.8383838383838382, "step": 760}, {"eval_loss": 0.3126079738140106, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.248, "eval_samples_per_second": 3.205, "eval_steps_per_second": 3.205, "epoch": 3.8383838383838382, "step": 760}, {"loss": 0.1109347939491272, "token_acc": 0.9204441740188621, "grad_norm": 0.6136153936386108, "learning_rate": 1.3483006802566544e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354973, "epoch": 3.8636363636363638, "step": 765}, {"loss": 0.06602246761322021, "token_acc": 0.97854340362923, "grad_norm": 0.24277837574481964, "learning_rate": 1.2917394717602121e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355116, "epoch": 3.888888888888889, "step": 770}, {"loss": 0.1160237193107605, "token_acc": 0.9735529696236965, "grad_norm": 0.574522852897644, "learning_rate": 1.2362137491387432e-05, "memory(GiB)": 132.94, 
"train_speed(iter/s)": 0.35515, "epoch": 3.9141414141414144, "step": 775}, {"loss": 0.09153335094451905, "token_acc": 0.9558498896247241, "grad_norm": 0.409150630235672, "learning_rate": 1.1817390172633403e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355102, "epoch": 3.9393939393939394, "step": 780}, {"eval_loss": 0.3179924190044403, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.2772, "eval_samples_per_second": 3.132, "eval_steps_per_second": 3.132, "epoch": 3.9393939393939394, "step": 780}, {"loss": 0.07547231316566468, "token_acc": 0.9211635750421585, "grad_norm": 1.5299787521362305, "learning_rate": 1.1283304875289336e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355, "epoch": 3.9646464646464645, "step": 785}, {"loss": 0.1015932559967041, "token_acc": 0.9662865642042637, "grad_norm": 0.803305983543396, "learning_rate": 1.0760030736066951e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35536, "epoch": 3.98989898989899, "step": 790}, {"loss": 0.08443622589111328, "token_acc": 0.9721483335941167, "grad_norm": 0.4793414771556854, "learning_rate": 1.024771387279585e-05, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355607, "epoch": 4.015151515151516, "step": 795}, {"loss": 0.035999318957328795, "token_acc": 0.9878715180594428, "grad_norm": 0.348900705575943, "learning_rate": 9.746497343621857e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355646, "epoch": 4.040404040404041, "step": 800}, {"eval_loss": 0.3235396444797516, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.2641, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 4.040404040404041, "step": 800}, {"loss": 0.06672016382217408, "token_acc": 0.9496715011776373, "grad_norm": 0.6782447099685669, "learning_rate": 9.256521107059834e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355197, "epoch": 4.065656565656566, "step": 805}, {"loss": 0.01867678463459015, "token_acc": 0.9937345737611544, "grad_norm": 0.02989630214869976, "learning_rate": 
8.777921982911996e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355631, "epoch": 4.090909090909091, "step": 810}, {"loss": 0.046671625971794126, "token_acc": 0.9823114975266077, "grad_norm": 0.47457507252693176, "learning_rate": 8.310833614062651e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355942, "epoch": 4.116161616161616, "step": 815}, {"loss": 0.03511995375156403, "token_acc": 0.9881111606479418, "grad_norm": 0.23747724294662476, "learning_rate": 7.85538642916015e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355854, "epoch": 4.141414141414141, "step": 820}, {"eval_loss": 0.3295816481113434, "eval_token_acc": 0.7608267716535433, "eval_runtime": 1.2496, "eval_samples_per_second": 3.201, "eval_steps_per_second": 3.201, "epoch": 4.141414141414141, "step": 820}, {"loss": 0.0608062744140625, "token_acc": 0.9560709705002137, "grad_norm": 0.46623966097831726, "learning_rate": 7.4117076061961885e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355301, "epoch": 4.166666666666667, "step": 825}, {"loss": 0.07521570324897767, "token_acc": 0.9431046931407943, "grad_norm": 0.3173864483833313, "learning_rate": 6.979921036993042e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355358, "epoch": 4.191919191919192, "step": 830}, {"loss": 0.10097864866256714, "token_acc": 0.9563927235195459, "grad_norm": 0.6492113471031189, "learning_rate": 6.5601472926081766e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355449, "epoch": 4.217171717171717, "step": 835}, {"loss": 0.07692533135414123, "token_acc": 0.9672113638790986, "grad_norm": 0.45515960454940796, "learning_rate": 6.152503589666425e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355338, "epoch": 4.242424242424242, "step": 840}, {"eval_loss": 0.3356034457683563, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2455, "eval_samples_per_second": 3.211, "eval_steps_per_second": 3.211, "epoch": 4.242424242424242, "step": 840}, {"loss": 0.14583762884140014, "token_acc": 
0.9234553884000378, "grad_norm": 0.8457357287406921, "learning_rate": 5.757103757628573e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354526, "epoch": 4.267676767676767, "step": 845}, {"loss": 0.10489171743392944, "token_acc": 0.9607229402261712, "grad_norm": 0.2833046019077301, "learning_rate": 5.374058207005944e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354312, "epoch": 4.292929292929293, "step": 850}, {"loss": 0.03666624426841736, "token_acc": 0.9860793909733551, "grad_norm": 0.39615917205810547, "learning_rate": 5.0034738985296095e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354264, "epoch": 4.318181818181818, "step": 855}, {"loss": 0.03459435701370239, "token_acc": 0.9874883535205644, "grad_norm": 0.20940014719963074, "learning_rate": 4.645454313282965e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354279, "epoch": 4.343434343434343, "step": 860}, {"eval_loss": 0.3347886800765991, "eval_token_acc": 0.7608267716535433, "eval_runtime": 1.2407, "eval_samples_per_second": 3.224, "eval_steps_per_second": 3.224, "epoch": 4.343434343434343, "step": 860}, {"loss": 0.042598605155944824, "token_acc": 0.9554907934825805, "grad_norm": 0.45534172654151917, "learning_rate": 4.3000994238058644e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354035, "epoch": 4.3686868686868685, "step": 865}, {"loss": 0.04259795844554901, "token_acc": 0.9884420519316023, "grad_norm": 0.6204268336296082, "learning_rate": 3.967505666178556e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354313, "epoch": 4.393939393939394, "step": 870}, {"loss": 0.024170319736003875, "token_acc": 0.9857792946530148, "grad_norm": 0.40318140387535095, "learning_rate": 3.647765913093132e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354708, "epoch": 4.41919191919192, "step": 875}, {"loss": 0.05446074604988098, "token_acc": 0.9749515431638587, "grad_norm": 0.6216991543769836, "learning_rate": 3.340969447919873e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354846, 
"epoch": 4.444444444444445, "step": 880}, {"eval_loss": 0.3343099355697632, "eval_token_acc": 0.7608267716535433, "eval_runtime": 1.2375, "eval_samples_per_second": 3.232, "eval_steps_per_second": 3.232, "epoch": 4.444444444444445, "step": 880}, {"loss": 0.03617172241210938, "token_acc": 0.9599399098647972, "grad_norm": 0.325848788022995, "learning_rate": 3.0472019397761064e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354631, "epoch": 4.46969696969697, "step": 885}, {"loss": 0.05567449927330017, "token_acc": 0.9825274278748476, "grad_norm": 0.3116080164909363, "learning_rate": 2.7665454196040664e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354558, "epoch": 4.494949494949495, "step": 890}, {"loss": 0.025903385877609254, "token_acc": 0.9856645504812616, "grad_norm": 0.4584411084651947, "learning_rate": 2.4990782572647975e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354943, "epoch": 4.52020202020202, "step": 895}, {"loss": 0.11551048755645751, "token_acc": 0.9571796522858983, "grad_norm": 0.6048192381858826, "learning_rate": 2.2448751396543787e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354798, "epoch": 4.545454545454545, "step": 900}, {"eval_loss": 0.33555707335472107, "eval_token_acc": 0.7618110236220472, "eval_runtime": 1.2587, "eval_samples_per_second": 3.178, "eval_steps_per_second": 3.178, "epoch": 4.545454545454545, "step": 900}, {"loss": 0.009146060794591904, "token_acc": 0.960080770425598, "grad_norm": 0.09098433703184128, "learning_rate": 2.004007049848461e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354755, "epoch": 4.570707070707071, "step": 905}, {"loss": 0.01627262681722641, "token_acc": 0.995425667090216, "grad_norm": 0.12259010970592499, "learning_rate": 1.7765412472811771e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355345, "epoch": 4.595959595959596, "step": 910}, {"loss": 0.07969475984573364, "token_acc": 0.9726568433844751, "grad_norm": 0.03434799984097481, "learning_rate": 1.5625412489637337e-06, 
"memory(GiB)": 132.94, "train_speed(iter/s)": 0.355547, "epoch": 4.621212121212121, "step": 915}, {"loss": 0.02643486261367798, "token_acc": 0.9886294416243655, "grad_norm": 0.23276808857917786, "learning_rate": 1.3620668117481472e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355964, "epoch": 4.646464646464646, "step": 920}, {"eval_loss": 0.3315931558609009, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2435, "eval_samples_per_second": 3.217, "eval_steps_per_second": 3.217, "epoch": 4.646464646464646, "step": 920}, {"loss": 0.014962595701217652, "token_acc": 0.9563416188655195, "grad_norm": 0.40637901425361633, "learning_rate": 1.1751739156407649e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355848, "epoch": 4.671717171717171, "step": 925}, {"loss": 0.09390320181846619, "token_acc": 0.9552877138413686, "grad_norm": 0.04171985015273094, "learning_rate": 1.0019147481706625e-06, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.356157, "epoch": 4.696969696969697, "step": 930}, {"loss": 0.09355719685554505, "token_acc": 0.9625611980416626, "grad_norm": 0.45875418186187744, "learning_rate": 8.423376898168245e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35577, "epoch": 4.722222222222222, "step": 935}, {"loss": 0.04375721216201782, "token_acc": 0.9753479792050981, "grad_norm": 0.06517396122217178, "learning_rate": 6.964873004985717e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355994, "epoch": 4.747474747474747, "step": 940}, {"eval_loss": 0.3333018124103546, "eval_token_acc": 0.7637795275590551, "eval_runtime": 1.2677, "eval_samples_per_second": 3.155, "eval_steps_per_second": 3.155, "epoch": 4.747474747474747, "step": 940}, {"loss": 0.03980659544467926, "token_acc": 0.9619817650094615, "grad_norm": 0.24420230090618134, "learning_rate": 5.644043071326932e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355016, "epoch": 4.7727272727272725, "step": 945}, {"loss": 0.03921380043029785, "token_acc": 0.973630831643002, "grad_norm": 
0.031997114419937134, "learning_rate": 4.461255922609986e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.354979, "epoch": 4.797979797979798, "step": 950}, {"loss": 0.016747798025608062, "token_acc": 0.9951302974466965, "grad_norm": 0.41990208625793457, "learning_rate": 3.416841837512952e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355094, "epoch": 4.8232323232323235, "step": 955}, {"loss": 0.08067426681518555, "token_acc": 0.9631029789807954, "grad_norm": 0.5648083090782166, "learning_rate": 2.511092455747932e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355274, "epoch": 4.848484848484849, "step": 960}, {"eval_loss": 0.33460578322410583, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.2582, "eval_samples_per_second": 3.179, "eval_steps_per_second": 3.179, "epoch": 4.848484848484849, "step": 960}, {"loss": 0.043474048376083374, "token_acc": 0.9473123191716156, "grad_norm": 0.4006679654121399, "learning_rate": 1.7442606966242004e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.35506, "epoch": 4.873737373737374, "step": 965}, {"loss": 0.04453698992729187, "token_acc": 0.9867834131835453, "grad_norm": 0.4876633584499359, "learning_rate": 1.1165606884234181e-07, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355439, "epoch": 4.898989898989899, "step": 970}, {"loss": 0.028288286924362183, "token_acc": 0.993429158110883, "grad_norm": 0.2543767988681793, "learning_rate": 6.281677086071303e-08, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355931, "epoch": 4.924242424242424, "step": 975}, {"loss": 0.07211887836456299, "token_acc": 0.9649309245483528, "grad_norm": 0.14917303621768951, "learning_rate": 2.792181348726941e-08, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355974, "epoch": 4.94949494949495, "step": 980}, {"eval_loss": 0.3347429037094116, "eval_token_acc": 0.7598425196850394, "eval_runtime": 1.245, "eval_samples_per_second": 3.213, "eval_steps_per_second": 3.213, "epoch": 4.94949494949495, "step": 980}, {"loss": 
0.14155206680297852, "token_acc": 0.9222326748196927, "grad_norm": 0.3278864622116089, "learning_rate": 6.980940707146389e-09, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355457, "epoch": 4.974747474747475, "step": 985}, {"loss": 0.016276916861534117, "token_acc": 0.9947106908158359, "grad_norm": 0.20631925761699677, "learning_rate": 0.0, "memory(GiB)": 132.94, "train_speed(iter/s)": 0.355697, "epoch": 5.0, "step": 990}, {"eval_loss": 0.3333551585674286, "eval_token_acc": 0.7618110236220472, "eval_runtime": 1.2395, "eval_samples_per_second": 3.227, "eval_steps_per_second": 3.227, "epoch": 5.0, "step": 990}, {"train_runtime": 2786.0397, "train_samples_per_second": 0.711, "train_steps_per_second": 0.355, "total_flos": 2.99483427739392e+17, "train_loss": 0.23823543456618232, "epoch": 5.0, "step": 990}], "memory": 132.9375} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs/events.out.tfevents.1737731402.kml-task-547024-record-9965643-prod-worker-0.48572.0 b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs/events.out.tfevents.1737731402.kml-task-547024-record-9965643-prod-worker-0.48572.0 new file mode 100644 index 0000000000000000000000000000000000000000..c29394d11bd9f88628ee7cc52c4a7dcf7b6a566b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_random20/v1-20250124-150833/runs/events.out.tfevents.1737731402.kml-task-547024-record-9965643-prod-worker-0.48572.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12d4de4d5b5f7782b6a08a1b25f41b34f9a03bca43feac30c93750e8461ac69 +size 98883 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..a6a59f8d07fe9299387ac02373ea82a4a3151c88 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": 
false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + 
"disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-142850', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/args.json new file mode 100644 index 0000000000000000000000000000000000000000..c823485ef18788286bcb71fedaeff4af4a3e1e58 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + 
"bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": 
false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + 
"mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + 
"galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, 
save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/README.md b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More 
Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..806047d5384c5f7ed5dfbe1eb8e39bd960a6142c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59355390f3d51a0a0d3a6ff575566a86c15b2ef6 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c3c4744bcf6195bf5c9ee6df071bf2042256fe6e7b99f74721430bd60bb8c8c +size 536991984 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/additional_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..c823485ef18788286bcb71fedaeff4af4a3e1e58 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', 
architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, 
sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/optimizer.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..afa5b562bfe9579a7f864f509df11e91a971dd40 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6eda4f34728b37912a52dad9acfb046b216a3c67e6ceff369dc95723da55e5 +size 1074499986 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/rng_state.pth b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a35447b29d0b26159824cc8259f0115015d566d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4244b648356be8d86df9303abb32db5c0e20957c7e4b539f31748ed5e9c0e12d +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/scheduler.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..1d2212ebbbb10dacbe1a1606284dbe00e474558f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ea7807968f14a7e6aa3199fa8036f47c82ff9fdd95652fbfa9a27983793616 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/trainer_state.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f7d732823733fec297f5a87622b46e283f050ff6 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/trainer_state.json @@ -0,0 +1,1513 @@ +{ + "best_metric": 0.30323198, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600", + "epoch": 3.0303030303030303, + "eval_steps": 20, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.15499143302440643, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5951199531555176, + "memory(GiB)": 71.81, + "step": 1, + "token_acc": 0.8394495412844036, + "train_speed(iter/s)": 0.207991 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.31547653675079346, + "learning_rate": 1e-05, + "loss": 0.7110069990158081, + "memory(GiB)": 81.23, + "step": 5, + "token_acc": 0.8298845043310876, + "train_speed(iter/s)": 0.34062 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 0.18780562281608582, + "learning_rate": 2e-05, + "loss": 0.7590272426605225, + "memory(GiB)": 87.6, + "step": 10, + "token_acc": 0.7956749880744156, + 
"train_speed(iter/s)": 0.36629 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.20020845532417297, + "learning_rate": 3e-05, + "loss": 0.7305656909942627, + "memory(GiB)": 98.05, + "step": 15, + "token_acc": 0.8000784518828452, + "train_speed(iter/s)": 0.368677 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 3.012465715408325, + "learning_rate": 4e-05, + "loss": 1.0381051063537599, + "memory(GiB)": 98.05, + "step": 20, + "token_acc": 0.8534532791642484, + "train_speed(iter/s)": 0.388448 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.6039576530456543, + "eval_runtime": 1.3108, + "eval_samples_per_second": 3.051, + "eval_steps_per_second": 3.051, + "eval_token_acc": 0.71751968503937, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.24263285100460052, + "learning_rate": 5e-05, + "loss": 0.534688138961792, + "memory(GiB)": 108.04, + "step": 25, + "token_acc": 0.8304161804745235, + "train_speed(iter/s)": 0.355688 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.49346253275871277, + "learning_rate": 6e-05, + "loss": 0.6209209442138672, + "memory(GiB)": 108.04, + "step": 30, + "token_acc": 0.8155661353756987, + "train_speed(iter/s)": 0.368676 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.15310031175613403, + "learning_rate": 7e-05, + "loss": 0.36601178646087645, + "memory(GiB)": 108.04, + "step": 35, + "token_acc": 0.8558015943312666, + "train_speed(iter/s)": 0.37743 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.11761368066072464, + "learning_rate": 8e-05, + "loss": 0.4429020404815674, + "memory(GiB)": 108.04, + "step": 40, + "token_acc": 0.8457292271934922, + "train_speed(iter/s)": 0.380325 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5790694355964661, + "eval_runtime": 1.3071, + "eval_samples_per_second": 3.06, + "eval_steps_per_second": 3.06, + "eval_token_acc": 0.7440944881889764, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.20896033942699432, + 
"learning_rate": 9e-05, + "loss": 0.4508810520172119, + "memory(GiB)": 108.04, + "step": 45, + "token_acc": 0.8364097363083164, + "train_speed(iter/s)": 0.367432 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.1467944234609604, + "learning_rate": 0.0001, + "loss": 0.503812837600708, + "memory(GiB)": 114.53, + "step": 50, + "token_acc": 0.8802865956811623, + "train_speed(iter/s)": 0.35784 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.19582347571849823, + "learning_rate": 9.999301905929286e-05, + "loss": 0.45382375717163087, + "memory(GiB)": 114.53, + "step": 55, + "token_acc": 0.8511966701352758, + "train_speed(iter/s)": 0.358299 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.22766543924808502, + "learning_rate": 9.997207818651274e-05, + "loss": 0.38322081565856936, + "memory(GiB)": 114.53, + "step": 60, + "token_acc": 0.8599060513954131, + "train_speed(iter/s)": 0.366204 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.5421442985534668, + "eval_runtime": 1.3042, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 3.067, + "eval_token_acc": 0.7519685039370079, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.287585973739624, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5506976127624512, + "memory(GiB)": 114.53, + "step": 65, + "token_acc": 0.826218827229836, + "train_speed(iter/s)": 0.359199 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.11081422865390778, + "learning_rate": 9.988834393115767e-05, + "loss": 0.41149077415466306, + "memory(GiB)": 114.53, + "step": 70, + "token_acc": 0.8737335359675785, + "train_speed(iter/s)": 0.356847 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.1686583310365677, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5796836376190185, + "memory(GiB)": 114.53, + "step": 75, + "token_acc": 0.8277399056109072, + "train_speed(iter/s)": 0.363118 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.1717352271080017, + 
"learning_rate": 9.974889075442521e-05, + "loss": 0.6482028961181641, + "memory(GiB)": 114.53, + "step": 80, + "token_acc": 0.8418734400234914, + "train_speed(iter/s)": 0.364963 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5349594354629517, + "eval_runtime": 1.2882, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 3.105, + "eval_token_acc": 0.7490157480314961, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.1399306207895279, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5291558265686035, + "memory(GiB)": 114.53, + "step": 85, + "token_acc": 0.8198504418762746, + "train_speed(iter/s)": 0.360238 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.3386712074279785, + "learning_rate": 9.9553874407739e-05, + "loss": 0.43833165168762206, + "memory(GiB)": 114.53, + "step": 90, + "token_acc": 0.8456410256410256, + "train_speed(iter/s)": 0.36297 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.12106972932815552, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3532155990600586, + "memory(GiB)": 114.53, + "step": 95, + "token_acc": 0.8691472093894369, + "train_speed(iter/s)": 0.36177 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1363830864429474, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4101123809814453, + "memory(GiB)": 129.08, + "step": 100, + "token_acc": 0.8671476137624862, + "train_speed(iter/s)": 0.357637 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5257760882377625, + "eval_runtime": 1.2863, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 3.11, + "eval_token_acc": 0.7588582677165354, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.1821882128715515, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5475019454956055, + "memory(GiB)": 129.08, + "step": 105, + "token_acc": 0.8072100313479624, + "train_speed(iter/s)": 0.356489 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.645659327507019, + "learning_rate": 
9.899808525182935e-05, + "loss": 0.45719470977783205, + "memory(GiB)": 129.08, + "step": 110, + "token_acc": 0.8226950354609929, + "train_speed(iter/s)": 0.362336 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.09764645248651505, + "learning_rate": 9.882482608435923e-05, + "loss": 0.44896726608276366, + "memory(GiB)": 129.08, + "step": 115, + "token_acc": 0.8672267372842662, + "train_speed(iter/s)": 0.357594 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.3066820800304413, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5043072700500488, + "memory(GiB)": 129.08, + "step": 120, + "token_acc": 0.8138049619258167, + "train_speed(iter/s)": 0.357072 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.519137442111969, + "eval_runtime": 1.2824, + "eval_samples_per_second": 3.119, + "eval_steps_per_second": 3.119, + "eval_token_acc": 0.7618110236220472, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.20023687183856964, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4765754222869873, + "memory(GiB)": 129.08, + "step": 125, + "token_acc": 0.8370143478961792, + "train_speed(iter/s)": 0.356444 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.21041052043437958, + "learning_rate": 9.822345875271883e-05, + "loss": 0.4863614082336426, + "memory(GiB)": 129.08, + "step": 130, + "token_acc": 0.8408579215546865, + "train_speed(iter/s)": 0.356648 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.10671097040176392, + "learning_rate": 9.799599295015154e-05, + "loss": 0.36075942516326903, + "memory(GiB)": 129.08, + "step": 135, + "token_acc": 0.8704587642535137, + "train_speed(iter/s)": 0.356657 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.21016854047775269, + "learning_rate": 9.775512486034563e-05, + "loss": 0.533957576751709, + "memory(GiB)": 129.08, + "step": 140, + "token_acc": 0.827848754678023, + "train_speed(iter/s)": 0.356881 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 
0.47979286313056946, + "eval_runtime": 1.306, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 3.063, + "eval_token_acc": 0.7578740157480315, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.21023879945278168, + "learning_rate": 9.750092174273521e-05, + "loss": 0.36088201999664304, + "memory(GiB)": 129.08, + "step": 145, + "token_acc": 0.857113273969766, + "train_speed(iter/s)": 0.353305 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.4503144919872284, + "learning_rate": 9.723345458039594e-05, + "loss": 0.3751711130142212, + "memory(GiB)": 129.08, + "step": 150, + "token_acc": 0.8822255538897218, + "train_speed(iter/s)": 0.354538 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.23807577788829803, + "learning_rate": 9.69527980602239e-05, + "loss": 0.41892757415771487, + "memory(GiB)": 129.08, + "step": 155, + "token_acc": 0.8584016644229593, + "train_speed(iter/s)": 0.354578 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.19749346375465393, + "learning_rate": 9.665903055208014e-05, + "loss": 0.339878511428833, + "memory(GiB)": 129.08, + "step": 160, + "token_acc": 0.8817881788178817, + "train_speed(iter/s)": 0.356324 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.46302199363708496, + "eval_runtime": 1.2753, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 3.137, + "eval_token_acc": 0.7559055118110236, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1409613937139511, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4177990436553955, + "memory(GiB)": 129.08, + "step": 165, + "token_acc": 0.8499143497166952, + "train_speed(iter/s)": 0.354285 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.20421652495861053, + "learning_rate": 9.603249433382144e-05, + "loss": 0.45575361251831054, + "memory(GiB)": 129.08, + "step": 170, + "token_acc": 0.8524418908331157, + "train_speed(iter/s)": 0.35386 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 
0.2770065367221832, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4119880199432373, + "memory(GiB)": 129.08, + "step": 175, + "token_acc": 0.8568082970893275, + "train_speed(iter/s)": 0.35561 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.19533886015415192, + "learning_rate": 9.535454568671704e-05, + "loss": 0.41430139541625977, + "memory(GiB)": 129.1, + "step": 180, + "token_acc": 0.8622650450165851, + "train_speed(iter/s)": 0.356495 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.47494348883628845, + "eval_runtime": 1.271, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 3.147, + "eval_token_acc": 0.765748031496063, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.258792519569397, + "learning_rate": 9.49965261014704e-05, + "loss": 0.49581570625305177, + "memory(GiB)": 129.1, + "step": 185, + "token_acc": 0.817189460476788, + "train_speed(iter/s)": 0.355245 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 2.9851603507995605, + "learning_rate": 9.462594179299406e-05, + "loss": 0.5660634517669678, + "memory(GiB)": 129.1, + "step": 190, + "token_acc": 0.8010047446274072, + "train_speed(iter/s)": 0.357613 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.15361513197422028, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5602671146392822, + "memory(GiB)": 129.1, + "step": 195, + "token_acc": 0.8152331953920143, + "train_speed(iter/s)": 0.357337 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.1556633859872818, + "learning_rate": 9.384749641033359e-05, + "loss": 0.5035863399505616, + "memory(GiB)": 129.1, + "step": 200, + "token_acc": 0.8482374484968717, + "train_speed(iter/s)": 0.353905 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4684031009674072, + "eval_runtime": 1.2829, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "eval_token_acc": 0.764763779527559, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.16396360099315643, 
+ "learning_rate": 9.343985270739182e-05, + "loss": 0.4510298728942871, + "memory(GiB)": 129.1, + "step": 205, + "token_acc": 0.8351512146752603, + "train_speed(iter/s)": 0.350077 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.20765098929405212, + "learning_rate": 9.302007896300698e-05, + "loss": 0.4226827621459961, + "memory(GiB)": 129.1, + "step": 210, + "token_acc": 0.8615891614793116, + "train_speed(iter/s)": 0.350194 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.17130540311336517, + "learning_rate": 9.25882923938038e-05, + "loss": 0.38046865463256835, + "memory(GiB)": 129.1, + "step": 215, + "token_acc": 0.8746039856923863, + "train_speed(iter/s)": 0.349574 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.31240323185920715, + "learning_rate": 9.214461357083985e-05, + "loss": 0.35848026275634765, + "memory(GiB)": 129.1, + "step": 220, + "token_acc": 0.8796054540179866, + "train_speed(iter/s)": 0.350614 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.4786238670349121, + "eval_runtime": 1.265, + "eval_samples_per_second": 3.162, + "eval_steps_per_second": 3.162, + "eval_token_acc": 0.7549212598425197, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.3606383800506592, + "learning_rate": 9.168916638593736e-05, + "loss": 0.48271026611328127, + "memory(GiB)": 129.1, + "step": 225, + "token_acc": 0.8429425702358118, + "train_speed(iter/s)": 0.349116 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.2114563286304474, + "learning_rate": 9.122207801708802e-05, + "loss": 0.39199128150939944, + "memory(GiB)": 129.1, + "step": 230, + "token_acc": 0.8686677560849746, + "train_speed(iter/s)": 0.347125 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.24240955710411072, + "learning_rate": 9.074347889294016e-05, + "loss": 0.2028397798538208, + "memory(GiB)": 129.1, + "step": 235, + "token_acc": 0.908675799086758, + "train_speed(iter/s)": 0.34928 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 
0.4021816551685333, + "learning_rate": 9.025350265637815e-05, + "loss": 0.44605064392089844, + "memory(GiB)": 129.1, + "step": 240, + "token_acc": 0.853990914990266, + "train_speed(iter/s)": 0.350502 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.477361261844635, + "eval_runtime": 1.2844, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 3.114, + "eval_token_acc": 0.7627952755905512, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.2472194880247116, + "learning_rate": 8.975228612720416e-05, + "loss": 0.28942854404449464, + "memory(GiB)": 129.1, + "step": 245, + "token_acc": 0.8561964776215867, + "train_speed(iter/s)": 0.35052 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.49620357155799866, + "learning_rate": 8.923996926393305e-05, + "loss": 0.4217637062072754, + "memory(GiB)": 129.1, + "step": 250, + "token_acc": 0.8497478099283249, + "train_speed(iter/s)": 0.352588 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.2509404122829437, + "learning_rate": 8.871669512471068e-05, + "loss": 0.3790408134460449, + "memory(GiB)": 129.1, + "step": 255, + "token_acc": 0.859628239172237, + "train_speed(iter/s)": 0.351148 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.39996537566185, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3863436222076416, + "memory(GiB)": 129.1, + "step": 260, + "token_acc": 0.855022437003797, + "train_speed(iter/s)": 0.352259 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.43004661798477173, + "eval_runtime": 1.2631, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 3.167, + "eval_token_acc": 0.7706692913385826, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.13070876896381378, + "learning_rate": 8.763786250861256e-05, + "loss": 0.30726191997528074, + "memory(GiB)": 129.1, + "step": 265, + "token_acc": 0.8950704812745016, + "train_speed(iter/s)": 0.348668 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.29351168870925903, 
+ "learning_rate": 8.708260528239788e-05, + "loss": 0.28099467754364016, + "memory(GiB)": 129.1, + "step": 270, + "token_acc": 0.8945386064030132, + "train_speed(iter/s)": 0.349732 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.29749786853790283, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2923673152923584, + "memory(GiB)": 129.1, + "step": 275, + "token_acc": 0.8954248366013072, + "train_speed(iter/s)": 0.349425 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.5873435139656067, + "learning_rate": 8.594118419389647e-05, + "loss": 0.42062225341796877, + "memory(GiB)": 129.1, + "step": 280, + "token_acc": 0.8647863247863248, + "train_speed(iter/s)": 0.350425 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.4043884575366974, + "eval_runtime": 1.2688, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 3.153, + "eval_token_acc": 0.7726377952755905, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.5294097065925598, + "learning_rate": 8.535533905932738e-05, + "loss": 0.23297641277313233, + "memory(GiB)": 129.1, + "step": 285, + "token_acc": 0.8899554336647241, + "train_speed(iter/s)": 0.350232 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.4506111741065979, + "learning_rate": 8.475962138373213e-05, + "loss": 0.36497814655303956, + "memory(GiB)": 129.1, + "step": 290, + "token_acc": 0.8623964437260052, + "train_speed(iter/s)": 0.351651 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.30804958939552307, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3962693214416504, + "memory(GiB)": 129.1, + "step": 295, + "token_acc": 0.85288089273514, + "train_speed(iter/s)": 0.353152 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.6068087816238403, + "learning_rate": 8.353923650696118e-05, + "loss": 0.3206871509552002, + "memory(GiB)": 129.1, + "step": 300, + "token_acc": 0.8861493836113126, + "train_speed(iter/s)": 0.353426 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 
0.42214763164520264, + "eval_runtime": 1.2774, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 3.131, + "eval_token_acc": 0.7775590551181102, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.3600684404373169, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3652678966522217, + "memory(GiB)": 129.1, + "step": 305, + "token_acc": 0.8686586614539701, + "train_speed(iter/s)": 0.350995 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.33816397190093994, + "learning_rate": 8.228139257794012e-05, + "loss": 0.30328705310821535, + "memory(GiB)": 129.1, + "step": 310, + "token_acc": 0.8997613365155132, + "train_speed(iter/s)": 0.352277 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.849923849105835, + "learning_rate": 8.163886089321493e-05, + "loss": 0.32169332504272463, + "memory(GiB)": 129.1, + "step": 315, + "token_acc": 0.8734251968503937, + "train_speed(iter/s)": 0.353612 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.37628281116485596, + "learning_rate": 8.098749444801224e-05, + "loss": 0.33423264026641847, + "memory(GiB)": 129.1, + "step": 320, + "token_acc": 0.9003220364974697, + "train_speed(iter/s)": 0.354058 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.4008274972438812, + "eval_runtime": 1.2671, + "eval_samples_per_second": 3.157, + "eval_steps_per_second": 3.157, + "eval_token_acc": 0.7726377952755905, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.37696319818496704, + "learning_rate": 8.032747512835337e-05, + "loss": 0.35271801948547366, + "memory(GiB)": 129.1, + "step": 325, + "token_acc": 0.8523900054318305, + "train_speed(iter/s)": 0.353524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.574674129486084, + "learning_rate": 7.965898723646776e-05, + "loss": 0.38188652992248534, + "memory(GiB)": 129.1, + "step": 330, + "token_acc": 0.8839562254800744, + "train_speed(iter/s)": 0.35462 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 
0.31084558367729187, + "learning_rate": 7.898221743932888e-05, + "loss": 0.38109359741210935, + "memory(GiB)": 129.1, + "step": 335, + "token_acc": 0.8710840033268644, + "train_speed(iter/s)": 0.354627 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.7837309837341309, + "learning_rate": 7.829735471652978e-05, + "loss": 0.26326937675476075, + "memory(GiB)": 129.1, + "step": 340, + "token_acc": 0.9064136125654451, + "train_speed(iter/s)": 0.355662 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.3601945638656616, + "eval_runtime": 1.2553, + "eval_samples_per_second": 3.186, + "eval_steps_per_second": 3.186, + "eval_token_acc": 0.7726377952755905, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.326943576335907, + "learning_rate": 7.760459030751284e-05, + "loss": 0.2819732904434204, + "memory(GiB)": 129.1, + "step": 345, + "token_acc": 0.894602905312268, + "train_speed(iter/s)": 0.35432 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.0278775691986084, + "learning_rate": 7.690411765816864e-05, + "loss": 0.16738426685333252, + "memory(GiB)": 129.1, + "step": 350, + "token_acc": 0.9215094339622641, + "train_speed(iter/s)": 0.355229 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.6167912483215332, + "learning_rate": 7.619613236681843e-05, + "loss": 0.4713289260864258, + "memory(GiB)": 129.1, + "step": 355, + "token_acc": 0.8477546549835706, + "train_speed(iter/s)": 0.355466 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.26643773913383484, + "learning_rate": 7.548083212959588e-05, + "loss": 0.259800124168396, + "memory(GiB)": 129.1, + "step": 360, + "token_acc": 0.8968109615617801, + "train_speed(iter/s)": 0.355753 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.37819036841392517, + "eval_runtime": 1.3003, + "eval_samples_per_second": 3.076, + "eval_steps_per_second": 3.076, + "eval_token_acc": 0.7785433070866141, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 
0.709635853767395, + "learning_rate": 7.475841668524268e-05, + "loss": 0.3847909212112427, + "memory(GiB)": 129.1, + "step": 365, + "token_acc": 0.8547150949683439, + "train_speed(iter/s)": 0.355576 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.28540483117103577, + "learning_rate": 7.402908775933419e-05, + "loss": 0.35011069774627684, + "memory(GiB)": 129.1, + "step": 370, + "token_acc": 0.8763573543928924, + "train_speed(iter/s)": 0.355573 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.4313144385814667, + "learning_rate": 7.329304900794991e-05, + "loss": 0.4088496208190918, + "memory(GiB)": 129.1, + "step": 375, + "token_acc": 0.8652825291966497, + "train_speed(iter/s)": 0.355232 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.6075259447097778, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3548138618469238, + "memory(GiB)": 129.1, + "step": 380, + "token_acc": 0.8879879054425509, + "train_speed(iter/s)": 0.35565 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.373136043548584, + "eval_runtime": 1.2667, + "eval_samples_per_second": 3.158, + "eval_steps_per_second": 3.158, + "eval_token_acc": 0.7706692913385826, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.1639028936624527, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3757177352905273, + "memory(GiB)": 129.1, + "step": 385, + "token_acc": 0.882828778036524, + "train_speed(iter/s)": 0.354333 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.32813289761543274, + "learning_rate": 7.104673812141675e-05, + "loss": 0.25887558460235593, + "memory(GiB)": 129.1, + "step": 390, + "token_acc": 0.8980108083247097, + "train_speed(iter/s)": 0.353935 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3131479322910309, + "learning_rate": 7.02859332377382e-05, + "loss": 0.254361891746521, + "memory(GiB)": 129.1, + "step": 395, + "token_acc": 0.9006518318723309, + "train_speed(iter/s)": 0.354813 + }, + { + "epoch": 2.0202020202020203, + 
"grad_norm": 0.5802826881408691, + "learning_rate": 6.951946375817474e-05, + "loss": 0.22614221572875975, + "memory(GiB)": 129.1, + "step": 400, + "token_acc": 0.9400690304361469, + "train_speed(iter/s)": 0.356043 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3536229729652405, + "eval_runtime": 1.2885, + "eval_samples_per_second": 3.104, + "eval_steps_per_second": 3.104, + "eval_token_acc": 0.7706692913385826, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6191949844360352, + "learning_rate": 6.874754370984606e-05, + "loss": 0.15614408254623413, + "memory(GiB)": 129.1, + "step": 405, + "token_acc": 0.9098951953178168, + "train_speed(iter/s)": 0.355365 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.7725083231925964, + "learning_rate": 6.797038864187564e-05, + "loss": 0.2733434200286865, + "memory(GiB)": 129.1, + "step": 410, + "token_acc": 0.9188966652943599, + "train_speed(iter/s)": 0.355292 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.3713182210922241, + "learning_rate": 6.718821556520151e-05, + "loss": 0.1949324369430542, + "memory(GiB)": 129.1, + "step": 415, + "token_acc": 0.9226980728051392, + "train_speed(iter/s)": 0.355986 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.2845621109008789, + "learning_rate": 6.640124289197845e-05, + "loss": 0.09827777743339539, + "memory(GiB)": 129.1, + "step": 420, + "token_acc": 0.9717413441955194, + "train_speed(iter/s)": 0.356829 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.3582577109336853, + "eval_runtime": 1.2794, + "eval_samples_per_second": 3.126, + "eval_steps_per_second": 3.126, + "eval_token_acc": 0.7667322834645669, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.607758641242981, + "learning_rate": 6.560969037458933e-05, + "loss": 0.19055347442626952, + "memory(GiB)": 129.1, + "step": 425, + "token_acc": 0.8919627256843331, + "train_speed(iter/s)": 0.356579 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 
0.4930749237537384, + "learning_rate": 6.481377904428171e-05, + "loss": 0.1357766032218933, + "memory(GiB)": 129.1, + "step": 430, + "token_acc": 0.9581637268204433, + "train_speed(iter/s)": 0.356034 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.2689533531665802, + "learning_rate": 6.401373114944781e-05, + "loss": 0.18788766860961914, + "memory(GiB)": 129.1, + "step": 435, + "token_acc": 0.9421218961625282, + "train_speed(iter/s)": 0.355275 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.9979881644248962, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2125793695449829, + "memory(GiB)": 129.1, + "step": 440, + "token_acc": 0.9272427983539094, + "train_speed(iter/s)": 0.355873 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.3688502013683319, + "eval_runtime": 1.2802, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 3.124, + "eval_token_acc": 0.7706692913385826, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.37610238790512085, + "learning_rate": 6.240212037280966e-05, + "loss": 0.11968926191329957, + "memory(GiB)": 129.1, + "step": 445, + "token_acc": 0.9304094308530866, + "train_speed(iter/s)": 0.355188 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7159104943275452, + "learning_rate": 6.159100751337642e-05, + "loss": 0.26689648628234863, + "memory(GiB)": 129.1, + "step": 450, + "token_acc": 0.9058954807513483, + "train_speed(iter/s)": 0.355771 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.29641231894493103, + "learning_rate": 6.077665800849568e-05, + "loss": 0.1854721188545227, + "memory(GiB)": 129.1, + "step": 455, + "token_acc": 0.9324742268041237, + "train_speed(iter/s)": 0.355991 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.25186142325401306, + "learning_rate": 5.99592992551918e-05, + "loss": 0.193935763835907, + "memory(GiB)": 129.1, + "step": 460, + "token_acc": 0.9377682403433476, + "train_speed(iter/s)": 0.356327 + }, + { + "epoch": 2.323232323232323, + 
"eval_loss": 0.3633354604244232, + "eval_runtime": 1.2615, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 3.171, + "eval_token_acc": 0.7706692913385826, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.49181655049324036, + "learning_rate": 5.913915949078452e-05, + "loss": 0.176645827293396, + "memory(GiB)": 129.1, + "step": 465, + "token_acc": 0.8921251348435815, + "train_speed(iter/s)": 0.356557 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.5645484328269958, + "learning_rate": 5.831646772915651e-05, + "loss": 0.13740575313568115, + "memory(GiB)": 129.1, + "step": 470, + "token_acc": 0.9428538968416269, + "train_speed(iter/s)": 0.356147 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 1.3345602750778198, + "learning_rate": 5.749145369680407e-05, + "loss": 0.21261224746704102, + "memory(GiB)": 129.1, + "step": 475, + "token_acc": 0.9206197398622801, + "train_speed(iter/s)": 0.356639 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.3252560496330261, + "learning_rate": 5.666434776868895e-05, + "loss": 0.2075648546218872, + "memory(GiB)": 129.1, + "step": 480, + "token_acc": 0.9252907219944784, + "train_speed(iter/s)": 0.355528 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.36280357837677, + "eval_runtime": 1.2645, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7667322834645669, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.6244832277297974, + "learning_rate": 5.583538090390882e-05, + "loss": 0.13927946090698243, + "memory(GiB)": 129.1, + "step": 485, + "token_acc": 0.8984392671341326, + "train_speed(iter/s)": 0.355772 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5457295179367065, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.28371801376342776, + "memory(GiB)": 129.1, + "step": 490, + "token_acc": 0.8928283642224013, + "train_speed(iter/s)": 0.355994 + }, + { + "epoch": 2.5, + "grad_norm": 0.26068228483200073, 
+ "learning_rate": 5.41727907343245e-05, + "loss": 0.16324831247329713, + "memory(GiB)": 129.1, + "step": 495, + "token_acc": 0.9382183908045977, + "train_speed(iter/s)": 0.356737 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.4725530445575714, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.23240807056427001, + "memory(GiB)": 129.1, + "step": 500, + "token_acc": 0.8918985471558729, + "train_speed(iter/s)": 0.356548 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.36516159772872925, + "eval_runtime": 1.2784, + "eval_samples_per_second": 3.129, + "eval_steps_per_second": 3.129, + "eval_token_acc": 0.7706692913385826, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.5990637540817261, + "learning_rate": 5.250554008935596e-05, + "loss": 0.15861610174179078, + "memory(GiB)": 129.1, + "step": 505, + "token_acc": 0.9060025910464949, + "train_speed(iter/s)": 0.356161 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.7070275545120239, + "learning_rate": 5.167074885038373e-05, + "loss": 0.16548032760620118, + "memory(GiB)": 129.1, + "step": 510, + "token_acc": 0.9370354175776126, + "train_speed(iter/s)": 0.357031 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.30997011065483093, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.18845115900039672, + "memory(GiB)": 129.1, + "step": 515, + "token_acc": 0.9271042471042471, + "train_speed(iter/s)": 0.357488 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.34516477584838867, + "learning_rate": 5e-05, + "loss": 0.19470884799957275, + "memory(GiB)": 129.1, + "step": 520, + "token_acc": 0.9188622362039586, + "train_speed(iter/s)": 0.357567 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.35303670167922974, + "eval_runtime": 1.2716, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 3.146, + "eval_token_acc": 0.765748031496063, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.400846928358078, + "learning_rate": 
4.916450892453495e-05, + "loss": 0.16326183080673218, + "memory(GiB)": 129.1, + "step": 525, + "token_acc": 0.9170015455950541, + "train_speed(iter/s)": 0.357449 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3526351749897003, + "learning_rate": 4.832925114961629e-05, + "loss": 0.2275157690048218, + "memory(GiB)": 129.1, + "step": 530, + "token_acc": 0.911013136584488, + "train_speed(iter/s)": 0.356693 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7750332355499268, + "learning_rate": 4.749445991064404e-05, + "loss": 0.16660224199295043, + "memory(GiB)": 129.1, + "step": 535, + "token_acc": 0.9465856041689285, + "train_speed(iter/s)": 0.356093 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.008236289024353, + "learning_rate": 4.666036831274392e-05, + "loss": 0.29327480792999266, + "memory(GiB)": 129.1, + "step": 540, + "token_acc": 0.893456980937661, + "train_speed(iter/s)": 0.355675 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.34143465757369995, + "eval_runtime": 1.2634, + "eval_samples_per_second": 3.166, + "eval_steps_per_second": 3.166, + "eval_token_acc": 0.764763779527559, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.5062530040740967, + "learning_rate": 4.582720926567552e-05, + "loss": 0.221860933303833, + "memory(GiB)": 129.1, + "step": 545, + "token_acc": 0.8882824294507026, + "train_speed(iter/s)": 0.355094 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4163118004798889, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.171502685546875, + "memory(GiB)": 129.1, + "step": 550, + "token_acc": 0.9311728853872454, + "train_speed(iter/s)": 0.355204 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.6872218251228333, + "learning_rate": 4.416461909609119e-05, + "loss": 0.18130356073379517, + "memory(GiB)": 129.1, + "step": 555, + "token_acc": 0.9372671732975711, + "train_speed(iter/s)": 0.35566 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.2589365839958191, + 
"learning_rate": 4.333565223131107e-05, + "loss": 0.15754028558731079, + "memory(GiB)": 129.1, + "step": 560, + "token_acc": 0.9293805736322005, + "train_speed(iter/s)": 0.355647 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.328652024269104, + "eval_runtime": 1.2642, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.764763779527559, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.6874573230743408, + "learning_rate": 4.250854630319593e-05, + "loss": 0.2055502414703369, + "memory(GiB)": 129.1, + "step": 565, + "token_acc": 0.9031949899161448, + "train_speed(iter/s)": 0.355089 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.6083143949508667, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.25478286743164064, + "memory(GiB)": 129.1, + "step": 570, + "token_acc": 0.903887358432813, + "train_speed(iter/s)": 0.354778 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.9529440999031067, + "learning_rate": 4.0860840509215496e-05, + "loss": 0.17642589807510375, + "memory(GiB)": 129.1, + "step": 575, + "token_acc": 0.9331498230436492, + "train_speed(iter/s)": 0.354847 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 0.17912031710147858, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.24578819274902344, + "memory(GiB)": 129.1, + "step": 580, + "token_acc": 0.8957880166106387, + "train_speed(iter/s)": 0.355458 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.32831043004989624, + "eval_runtime": 1.2645, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7706692913385826, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.7658194303512573, + "learning_rate": 3.922334199150432e-05, + "loss": 0.21303670406341552, + "memory(GiB)": 129.1, + "step": 585, + "token_acc": 0.8920780711825488, + "train_speed(iter/s)": 0.355426 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.7090197801589966, + "learning_rate": 
3.840899248662358e-05, + "loss": 0.19972538948059082, + "memory(GiB)": 129.1, + "step": 590, + "token_acc": 0.9341101694915255, + "train_speed(iter/s)": 0.354892 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.26023608446121216, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.2259267807006836, + "memory(GiB)": 129.1, + "step": 595, + "token_acc": 0.9335699797160243, + "train_speed(iter/s)": 0.354377 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.8862583041191101, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.13799512386322021, + "memory(GiB)": 129.1, + "step": 600, + "token_acc": 0.957043945174509, + "train_speed(iter/s)": 0.354572 + }, + { + "epoch": 3.0303030303030303, + "eval_loss": 0.30323198437690735, + "eval_runtime": 1.2806, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 3.123, + "eval_token_acc": 0.7706692913385826, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.798916387913216e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/training_args.bin b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..07b2e49761b5734cbcf2c2a471961ee74cc1f1e3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d6c9cb0acbab470bfb75bbb0e550391c75364a947acbeffea57da7bacf0844ba +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/README.md b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_config.json 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..806047d5384c5f7ed5dfbe1eb8e39bd960a6142c --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..580b03155697da7f73712a4c3ee73e1be97dde78 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd49befc9361dd92951a586944e43c06d6ee00a06e0e0e99c82fdbb3a659154 +size 536991984 diff --git 
a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/additional_config.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/args.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/args.json new file mode 100644 index 0000000000000000000000000000000000000000..c823485ef18788286bcb71fedaeff4af4a3e1e58 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": 
"reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": 
null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + 
"vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, 
warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/optimizer.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ccde0ccdacd163b8fe60846f455026508c2cf3b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b388d1d449d83f6873a11b86ee73cbf958fef082fbbb24cae67e515b48f6b5f6 +size 1074499986 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/rng_state.pth b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f578c96e0c3618c3bbb281b4b909b1fd59d06cd4 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3070cc1acc97b08923660869fc07bc6f52570615a6cd9b2a82b902c53159d6 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/scheduler.pt b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f8882c16fb0abc091aaea5286781182c084d87d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b3a16451354ac84ec594942621c3011b01d575ac8a6b2fa4481b0291c904a7 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/trainer_state.json b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9bfe15667c8d05a993163982c582971834a591e3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/trainer_state.json @@ -0,0 +1,2473 @@ +{ + "best_metric": 0.30323198, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 990, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.15499143302440643, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5951199531555176, + "memory(GiB)": 71.81, + "step": 1, + "token_acc": 0.8394495412844036, + "train_speed(iter/s)": 0.207991 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.31547653675079346, + "learning_rate": 1e-05, + "loss": 0.7110069990158081, + "memory(GiB)": 81.23, + "step": 5, + "token_acc": 0.8298845043310876, + "train_speed(iter/s)": 0.34062 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 0.18780562281608582, + "learning_rate": 2e-05, + "loss": 0.7590272426605225, + "memory(GiB)": 87.6, + "step": 10, + "token_acc": 0.7956749880744156, + "train_speed(iter/s)": 0.36629 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.20020845532417297, + "learning_rate": 3e-05, + "loss": 0.7305656909942627, + "memory(GiB)": 98.05, + "step": 15, + "token_acc": 0.8000784518828452, + "train_speed(iter/s)": 0.368677 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 3.012465715408325, + "learning_rate": 4e-05, + "loss": 1.0381051063537599, + "memory(GiB)": 98.05, + "step": 20, + "token_acc": 0.8534532791642484, + "train_speed(iter/s)": 0.388448 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.6039576530456543, + "eval_runtime": 1.3108, + "eval_samples_per_second": 3.051, + "eval_steps_per_second": 3.051, + "eval_token_acc": 0.71751968503937, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.24263285100460052, + "learning_rate": 5e-05, + "loss": 0.534688138961792, + "memory(GiB)": 108.04, + "step": 25, + "token_acc": 0.8304161804745235, + "train_speed(iter/s)": 0.355688 + }, + { + 
"epoch": 0.15151515151515152, + "grad_norm": 0.49346253275871277, + "learning_rate": 6e-05, + "loss": 0.6209209442138672, + "memory(GiB)": 108.04, + "step": 30, + "token_acc": 0.8155661353756987, + "train_speed(iter/s)": 0.368676 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.15310031175613403, + "learning_rate": 7e-05, + "loss": 0.36601178646087645, + "memory(GiB)": 108.04, + "step": 35, + "token_acc": 0.8558015943312666, + "train_speed(iter/s)": 0.37743 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.11761368066072464, + "learning_rate": 8e-05, + "loss": 0.4429020404815674, + "memory(GiB)": 108.04, + "step": 40, + "token_acc": 0.8457292271934922, + "train_speed(iter/s)": 0.380325 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5790694355964661, + "eval_runtime": 1.3071, + "eval_samples_per_second": 3.06, + "eval_steps_per_second": 3.06, + "eval_token_acc": 0.7440944881889764, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.20896033942699432, + "learning_rate": 9e-05, + "loss": 0.4508810520172119, + "memory(GiB)": 108.04, + "step": 45, + "token_acc": 0.8364097363083164, + "train_speed(iter/s)": 0.367432 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.1467944234609604, + "learning_rate": 0.0001, + "loss": 0.503812837600708, + "memory(GiB)": 114.53, + "step": 50, + "token_acc": 0.8802865956811623, + "train_speed(iter/s)": 0.35784 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.19582347571849823, + "learning_rate": 9.999301905929286e-05, + "loss": 0.45382375717163087, + "memory(GiB)": 114.53, + "step": 55, + "token_acc": 0.8511966701352758, + "train_speed(iter/s)": 0.358299 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.22766543924808502, + "learning_rate": 9.997207818651274e-05, + "loss": 0.38322081565856936, + "memory(GiB)": 114.53, + "step": 60, + "token_acc": 0.8599060513954131, + "train_speed(iter/s)": 0.366204 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 
0.5421442985534668, + "eval_runtime": 1.3042, + "eval_samples_per_second": 3.067, + "eval_steps_per_second": 3.067, + "eval_token_acc": 0.7519685039370079, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.287585973739624, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5506976127624512, + "memory(GiB)": 114.53, + "step": 65, + "token_acc": 0.826218827229836, + "train_speed(iter/s)": 0.359199 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.11081422865390778, + "learning_rate": 9.988834393115767e-05, + "loss": 0.41149077415466306, + "memory(GiB)": 114.53, + "step": 70, + "token_acc": 0.8737335359675785, + "train_speed(iter/s)": 0.356847 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.1686583310365677, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5796836376190185, + "memory(GiB)": 114.53, + "step": 75, + "token_acc": 0.8277399056109072, + "train_speed(iter/s)": 0.363118 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.1717352271080017, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6482028961181641, + "memory(GiB)": 114.53, + "step": 80, + "token_acc": 0.8418734400234914, + "train_speed(iter/s)": 0.364963 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5349594354629517, + "eval_runtime": 1.2882, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 3.105, + "eval_token_acc": 0.7490157480314961, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.1399306207895279, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5291558265686035, + "memory(GiB)": 114.53, + "step": 85, + "token_acc": 0.8198504418762746, + "train_speed(iter/s)": 0.360238 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.3386712074279785, + "learning_rate": 9.9553874407739e-05, + "loss": 0.43833165168762206, + "memory(GiB)": 114.53, + "step": 90, + "token_acc": 0.8456410256410256, + "train_speed(iter/s)": 0.36297 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.12106972932815552, 
+ "learning_rate": 9.94355956928673e-05, + "loss": 0.3532155990600586, + "memory(GiB)": 114.53, + "step": 95, + "token_acc": 0.8691472093894369, + "train_speed(iter/s)": 0.36177 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.1363830864429474, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4101123809814453, + "memory(GiB)": 129.08, + "step": 100, + "token_acc": 0.8671476137624862, + "train_speed(iter/s)": 0.357637 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5257760882377625, + "eval_runtime": 1.2863, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 3.11, + "eval_token_acc": 0.7588582677165354, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.1821882128715515, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5475019454956055, + "memory(GiB)": 129.08, + "step": 105, + "token_acc": 0.8072100313479624, + "train_speed(iter/s)": 0.356489 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.645659327507019, + "learning_rate": 9.899808525182935e-05, + "loss": 0.45719470977783205, + "memory(GiB)": 129.08, + "step": 110, + "token_acc": 0.8226950354609929, + "train_speed(iter/s)": 0.362336 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.09764645248651505, + "learning_rate": 9.882482608435923e-05, + "loss": 0.44896726608276366, + "memory(GiB)": 129.08, + "step": 115, + "token_acc": 0.8672267372842662, + "train_speed(iter/s)": 0.357594 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.3066820800304413, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5043072700500488, + "memory(GiB)": 129.08, + "step": 120, + "token_acc": 0.8138049619258167, + "train_speed(iter/s)": 0.357072 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.519137442111969, + "eval_runtime": 1.2824, + "eval_samples_per_second": 3.119, + "eval_steps_per_second": 3.119, + "eval_token_acc": 0.7618110236220472, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.20023687183856964, + 
"learning_rate": 9.843745875103627e-05, + "loss": 0.4765754222869873, + "memory(GiB)": 129.08, + "step": 125, + "token_acc": 0.8370143478961792, + "train_speed(iter/s)": 0.356444 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.21041052043437958, + "learning_rate": 9.822345875271883e-05, + "loss": 0.4863614082336426, + "memory(GiB)": 129.08, + "step": 130, + "token_acc": 0.8408579215546865, + "train_speed(iter/s)": 0.356648 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.10671097040176392, + "learning_rate": 9.799599295015154e-05, + "loss": 0.36075942516326903, + "memory(GiB)": 129.08, + "step": 135, + "token_acc": 0.8704587642535137, + "train_speed(iter/s)": 0.356657 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.21016854047775269, + "learning_rate": 9.775512486034563e-05, + "loss": 0.533957576751709, + "memory(GiB)": 129.08, + "step": 140, + "token_acc": 0.827848754678023, + "train_speed(iter/s)": 0.356881 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.47979286313056946, + "eval_runtime": 1.306, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 3.063, + "eval_token_acc": 0.7578740157480315, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.21023879945278168, + "learning_rate": 9.750092174273521e-05, + "loss": 0.36088201999664304, + "memory(GiB)": 129.08, + "step": 145, + "token_acc": 0.857113273969766, + "train_speed(iter/s)": 0.353305 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.4503144919872284, + "learning_rate": 9.723345458039594e-05, + "loss": 0.3751711130142212, + "memory(GiB)": 129.08, + "step": 150, + "token_acc": 0.8822255538897218, + "train_speed(iter/s)": 0.354538 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.23807577788829803, + "learning_rate": 9.69527980602239e-05, + "loss": 0.41892757415771487, + "memory(GiB)": 129.08, + "step": 155, + "token_acc": 0.8584016644229593, + "train_speed(iter/s)": 0.354578 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 
0.19749346375465393, + "learning_rate": 9.665903055208014e-05, + "loss": 0.339878511428833, + "memory(GiB)": 129.08, + "step": 160, + "token_acc": 0.8817881788178817, + "train_speed(iter/s)": 0.356324 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.46302199363708496, + "eval_runtime": 1.2753, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 3.137, + "eval_token_acc": 0.7559055118110236, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1409613937139511, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4177990436553955, + "memory(GiB)": 129.08, + "step": 165, + "token_acc": 0.8499143497166952, + "train_speed(iter/s)": 0.354285 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.20421652495861053, + "learning_rate": 9.603249433382144e-05, + "loss": 0.45575361251831054, + "memory(GiB)": 129.08, + "step": 170, + "token_acc": 0.8524418908331157, + "train_speed(iter/s)": 0.35386 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.2770065367221832, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4119880199432373, + "memory(GiB)": 129.08, + "step": 175, + "token_acc": 0.8568082970893275, + "train_speed(iter/s)": 0.35561 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.19533886015415192, + "learning_rate": 9.535454568671704e-05, + "loss": 0.41430139541625977, + "memory(GiB)": 129.1, + "step": 180, + "token_acc": 0.8622650450165851, + "train_speed(iter/s)": 0.356495 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.47494348883628845, + "eval_runtime": 1.271, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 3.147, + "eval_token_acc": 0.765748031496063, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.258792519569397, + "learning_rate": 9.49965261014704e-05, + "loss": 0.49581570625305177, + "memory(GiB)": 129.1, + "step": 185, + "token_acc": 0.817189460476788, + "train_speed(iter/s)": 0.355245 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 
2.9851603507995605, + "learning_rate": 9.462594179299406e-05, + "loss": 0.5660634517669678, + "memory(GiB)": 129.1, + "step": 190, + "token_acc": 0.8010047446274072, + "train_speed(iter/s)": 0.357613 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.15361513197422028, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5602671146392822, + "memory(GiB)": 129.1, + "step": 195, + "token_acc": 0.8152331953920143, + "train_speed(iter/s)": 0.357337 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.1556633859872818, + "learning_rate": 9.384749641033359e-05, + "loss": 0.5035863399505616, + "memory(GiB)": 129.1, + "step": 200, + "token_acc": 0.8482374484968717, + "train_speed(iter/s)": 0.353905 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4684031009674072, + "eval_runtime": 1.2829, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "eval_token_acc": 0.764763779527559, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.16396360099315643, + "learning_rate": 9.343985270739182e-05, + "loss": 0.4510298728942871, + "memory(GiB)": 129.1, + "step": 205, + "token_acc": 0.8351512146752603, + "train_speed(iter/s)": 0.350077 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.20765098929405212, + "learning_rate": 9.302007896300698e-05, + "loss": 0.4226827621459961, + "memory(GiB)": 129.1, + "step": 210, + "token_acc": 0.8615891614793116, + "train_speed(iter/s)": 0.350194 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.17130540311336517, + "learning_rate": 9.25882923938038e-05, + "loss": 0.38046865463256835, + "memory(GiB)": 129.1, + "step": 215, + "token_acc": 0.8746039856923863, + "train_speed(iter/s)": 0.349574 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.31240323185920715, + "learning_rate": 9.214461357083985e-05, + "loss": 0.35848026275634765, + "memory(GiB)": 129.1, + "step": 220, + "token_acc": 0.8796054540179866, + "train_speed(iter/s)": 0.350614 + }, + { + "epoch": 1.1111111111111112, 
+ "eval_loss": 0.4786238670349121, + "eval_runtime": 1.265, + "eval_samples_per_second": 3.162, + "eval_steps_per_second": 3.162, + "eval_token_acc": 0.7549212598425197, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.3606383800506592, + "learning_rate": 9.168916638593736e-05, + "loss": 0.48271026611328127, + "memory(GiB)": 129.1, + "step": 225, + "token_acc": 0.8429425702358118, + "train_speed(iter/s)": 0.349116 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.2114563286304474, + "learning_rate": 9.122207801708802e-05, + "loss": 0.39199128150939944, + "memory(GiB)": 129.1, + "step": 230, + "token_acc": 0.8686677560849746, + "train_speed(iter/s)": 0.347125 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.24240955710411072, + "learning_rate": 9.074347889294016e-05, + "loss": 0.2028397798538208, + "memory(GiB)": 129.1, + "step": 235, + "token_acc": 0.908675799086758, + "train_speed(iter/s)": 0.34928 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.4021816551685333, + "learning_rate": 9.025350265637815e-05, + "loss": 0.44605064392089844, + "memory(GiB)": 129.1, + "step": 240, + "token_acc": 0.853990914990266, + "train_speed(iter/s)": 0.350502 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.477361261844635, + "eval_runtime": 1.2844, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 3.114, + "eval_token_acc": 0.7627952755905512, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.2472194880247116, + "learning_rate": 8.975228612720416e-05, + "loss": 0.28942854404449464, + "memory(GiB)": 129.1, + "step": 245, + "token_acc": 0.8561964776215867, + "train_speed(iter/s)": 0.35052 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.49620357155799866, + "learning_rate": 8.923996926393305e-05, + "loss": 0.4217637062072754, + "memory(GiB)": 129.1, + "step": 250, + "token_acc": 0.8497478099283249, + "train_speed(iter/s)": 0.352588 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 
0.2509404122829437, + "learning_rate": 8.871669512471068e-05, + "loss": 0.3790408134460449, + "memory(GiB)": 129.1, + "step": 255, + "token_acc": 0.859628239172237, + "train_speed(iter/s)": 0.351148 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.39996537566185, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3863436222076416, + "memory(GiB)": 129.1, + "step": 260, + "token_acc": 0.855022437003797, + "train_speed(iter/s)": 0.352259 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.43004661798477173, + "eval_runtime": 1.2631, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 3.167, + "eval_token_acc": 0.7706692913385826, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.13070876896381378, + "learning_rate": 8.763786250861256e-05, + "loss": 0.30726191997528074, + "memory(GiB)": 129.1, + "step": 265, + "token_acc": 0.8950704812745016, + "train_speed(iter/s)": 0.348668 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.29351168870925903, + "learning_rate": 8.708260528239788e-05, + "loss": 0.28099467754364016, + "memory(GiB)": 129.1, + "step": 270, + "token_acc": 0.8945386064030132, + "train_speed(iter/s)": 0.349732 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.29749786853790283, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2923673152923584, + "memory(GiB)": 129.1, + "step": 275, + "token_acc": 0.8954248366013072, + "train_speed(iter/s)": 0.349425 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.5873435139656067, + "learning_rate": 8.594118419389647e-05, + "loss": 0.42062225341796877, + "memory(GiB)": 129.1, + "step": 280, + "token_acc": 0.8647863247863248, + "train_speed(iter/s)": 0.350425 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.4043884575366974, + "eval_runtime": 1.2688, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 3.153, + "eval_token_acc": 0.7726377952755905, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 
0.5294097065925598, + "learning_rate": 8.535533905932738e-05, + "loss": 0.23297641277313233, + "memory(GiB)": 129.1, + "step": 285, + "token_acc": 0.8899554336647241, + "train_speed(iter/s)": 0.350232 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.4506111741065979, + "learning_rate": 8.475962138373213e-05, + "loss": 0.36497814655303956, + "memory(GiB)": 129.1, + "step": 290, + "token_acc": 0.8623964437260052, + "train_speed(iter/s)": 0.351651 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.30804958939552307, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3962693214416504, + "memory(GiB)": 129.1, + "step": 295, + "token_acc": 0.85288089273514, + "train_speed(iter/s)": 0.353152 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.6068087816238403, + "learning_rate": 8.353923650696118e-05, + "loss": 0.3206871509552002, + "memory(GiB)": 129.1, + "step": 300, + "token_acc": 0.8861493836113126, + "train_speed(iter/s)": 0.353426 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.42214763164520264, + "eval_runtime": 1.2774, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 3.131, + "eval_token_acc": 0.7775590551181102, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.3600684404373169, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3652678966522217, + "memory(GiB)": 129.1, + "step": 305, + "token_acc": 0.8686586614539701, + "train_speed(iter/s)": 0.350995 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.33816397190093994, + "learning_rate": 8.228139257794012e-05, + "loss": 0.30328705310821535, + "memory(GiB)": 129.1, + "step": 310, + "token_acc": 0.8997613365155132, + "train_speed(iter/s)": 0.352277 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.849923849105835, + "learning_rate": 8.163886089321493e-05, + "loss": 0.32169332504272463, + "memory(GiB)": 129.1, + "step": 315, + "token_acc": 0.8734251968503937, + "train_speed(iter/s)": 0.353612 + }, + { + "epoch": 1.6161616161616161, 
+ "grad_norm": 0.37628281116485596, + "learning_rate": 8.098749444801224e-05, + "loss": 0.33423264026641847, + "memory(GiB)": 129.1, + "step": 320, + "token_acc": 0.9003220364974697, + "train_speed(iter/s)": 0.354058 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.4008274972438812, + "eval_runtime": 1.2671, + "eval_samples_per_second": 3.157, + "eval_steps_per_second": 3.157, + "eval_token_acc": 0.7726377952755905, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.37696319818496704, + "learning_rate": 8.032747512835337e-05, + "loss": 0.35271801948547366, + "memory(GiB)": 129.1, + "step": 325, + "token_acc": 0.8523900054318305, + "train_speed(iter/s)": 0.353524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.574674129486084, + "learning_rate": 7.965898723646776e-05, + "loss": 0.38188652992248534, + "memory(GiB)": 129.1, + "step": 330, + "token_acc": 0.8839562254800744, + "train_speed(iter/s)": 0.35462 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.31084558367729187, + "learning_rate": 7.898221743932888e-05, + "loss": 0.38109359741210935, + "memory(GiB)": 129.1, + "step": 335, + "token_acc": 0.8710840033268644, + "train_speed(iter/s)": 0.354627 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.7837309837341309, + "learning_rate": 7.829735471652978e-05, + "loss": 0.26326937675476075, + "memory(GiB)": 129.1, + "step": 340, + "token_acc": 0.9064136125654451, + "train_speed(iter/s)": 0.355662 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.3601945638656616, + "eval_runtime": 1.2553, + "eval_samples_per_second": 3.186, + "eval_steps_per_second": 3.186, + "eval_token_acc": 0.7726377952755905, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.326943576335907, + "learning_rate": 7.760459030751284e-05, + "loss": 0.2819732904434204, + "memory(GiB)": 129.1, + "step": 345, + "token_acc": 0.894602905312268, + "train_speed(iter/s)": 0.35432 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 
1.0278775691986084, + "learning_rate": 7.690411765816864e-05, + "loss": 0.16738426685333252, + "memory(GiB)": 129.1, + "step": 350, + "token_acc": 0.9215094339622641, + "train_speed(iter/s)": 0.355229 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.6167912483215332, + "learning_rate": 7.619613236681843e-05, + "loss": 0.4713289260864258, + "memory(GiB)": 129.1, + "step": 355, + "token_acc": 0.8477546549835706, + "train_speed(iter/s)": 0.355466 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.26643773913383484, + "learning_rate": 7.548083212959588e-05, + "loss": 0.259800124168396, + "memory(GiB)": 129.1, + "step": 360, + "token_acc": 0.8968109615617801, + "train_speed(iter/s)": 0.355753 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.37819036841392517, + "eval_runtime": 1.3003, + "eval_samples_per_second": 3.076, + "eval_steps_per_second": 3.076, + "eval_token_acc": 0.7785433070866141, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.709635853767395, + "learning_rate": 7.475841668524268e-05, + "loss": 0.3847909212112427, + "memory(GiB)": 129.1, + "step": 365, + "token_acc": 0.8547150949683439, + "train_speed(iter/s)": 0.355576 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.28540483117103577, + "learning_rate": 7.402908775933419e-05, + "loss": 0.35011069774627684, + "memory(GiB)": 129.1, + "step": 370, + "token_acc": 0.8763573543928924, + "train_speed(iter/s)": 0.355573 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.4313144385814667, + "learning_rate": 7.329304900794991e-05, + "loss": 0.4088496208190918, + "memory(GiB)": 129.1, + "step": 375, + "token_acc": 0.8652825291966497, + "train_speed(iter/s)": 0.355232 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.6075259447097778, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3548138618469238, + "memory(GiB)": 129.1, + "step": 380, + "token_acc": 0.8879879054425509, + "train_speed(iter/s)": 0.35565 + }, + { + "epoch": 1.9191919191919191, + 
"eval_loss": 0.373136043548584, + "eval_runtime": 1.2667, + "eval_samples_per_second": 3.158, + "eval_steps_per_second": 3.158, + "eval_token_acc": 0.7706692913385826, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.1639028936624527, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3757177352905273, + "memory(GiB)": 129.1, + "step": 385, + "token_acc": 0.882828778036524, + "train_speed(iter/s)": 0.354333 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.32813289761543274, + "learning_rate": 7.104673812141675e-05, + "loss": 0.25887558460235593, + "memory(GiB)": 129.1, + "step": 390, + "token_acc": 0.8980108083247097, + "train_speed(iter/s)": 0.353935 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3131479322910309, + "learning_rate": 7.02859332377382e-05, + "loss": 0.254361891746521, + "memory(GiB)": 129.1, + "step": 395, + "token_acc": 0.9006518318723309, + "train_speed(iter/s)": 0.354813 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.5802826881408691, + "learning_rate": 6.951946375817474e-05, + "loss": 0.22614221572875975, + "memory(GiB)": 129.1, + "step": 400, + "token_acc": 0.9400690304361469, + "train_speed(iter/s)": 0.356043 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3536229729652405, + "eval_runtime": 1.2885, + "eval_samples_per_second": 3.104, + "eval_steps_per_second": 3.104, + "eval_token_acc": 0.7706692913385826, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6191949844360352, + "learning_rate": 6.874754370984606e-05, + "loss": 0.15614408254623413, + "memory(GiB)": 129.1, + "step": 405, + "token_acc": 0.9098951953178168, + "train_speed(iter/s)": 0.355365 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.7725083231925964, + "learning_rate": 6.797038864187564e-05, + "loss": 0.2733434200286865, + "memory(GiB)": 129.1, + "step": 410, + "token_acc": 0.9188966652943599, + "train_speed(iter/s)": 0.355292 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 
0.3713182210922241, + "learning_rate": 6.718821556520151e-05, + "loss": 0.1949324369430542, + "memory(GiB)": 129.1, + "step": 415, + "token_acc": 0.9226980728051392, + "train_speed(iter/s)": 0.355986 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.2845621109008789, + "learning_rate": 6.640124289197845e-05, + "loss": 0.09827777743339539, + "memory(GiB)": 129.1, + "step": 420, + "token_acc": 0.9717413441955194, + "train_speed(iter/s)": 0.356829 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.3582577109336853, + "eval_runtime": 1.2794, + "eval_samples_per_second": 3.126, + "eval_steps_per_second": 3.126, + "eval_token_acc": 0.7667322834645669, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.607758641242981, + "learning_rate": 6.560969037458933e-05, + "loss": 0.19055347442626952, + "memory(GiB)": 129.1, + "step": 425, + "token_acc": 0.8919627256843331, + "train_speed(iter/s)": 0.356579 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.4930749237537384, + "learning_rate": 6.481377904428171e-05, + "loss": 0.1357766032218933, + "memory(GiB)": 129.1, + "step": 430, + "token_acc": 0.9581637268204433, + "train_speed(iter/s)": 0.356034 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.2689533531665802, + "learning_rate": 6.401373114944781e-05, + "loss": 0.18788766860961914, + "memory(GiB)": 129.1, + "step": 435, + "token_acc": 0.9421218961625282, + "train_speed(iter/s)": 0.355275 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.9979881644248962, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2125793695449829, + "memory(GiB)": 129.1, + "step": 440, + "token_acc": 0.9272427983539094, + "train_speed(iter/s)": 0.355873 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.3688502013683319, + "eval_runtime": 1.2802, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 3.124, + "eval_token_acc": 0.7706692913385826, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.37610238790512085, 
+ "learning_rate": 6.240212037280966e-05, + "loss": 0.11968926191329957, + "memory(GiB)": 129.1, + "step": 445, + "token_acc": 0.9304094308530866, + "train_speed(iter/s)": 0.355188 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.7159104943275452, + "learning_rate": 6.159100751337642e-05, + "loss": 0.26689648628234863, + "memory(GiB)": 129.1, + "step": 450, + "token_acc": 0.9058954807513483, + "train_speed(iter/s)": 0.355771 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.29641231894493103, + "learning_rate": 6.077665800849568e-05, + "loss": 0.1854721188545227, + "memory(GiB)": 129.1, + "step": 455, + "token_acc": 0.9324742268041237, + "train_speed(iter/s)": 0.355991 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.25186142325401306, + "learning_rate": 5.99592992551918e-05, + "loss": 0.193935763835907, + "memory(GiB)": 129.1, + "step": 460, + "token_acc": 0.9377682403433476, + "train_speed(iter/s)": 0.356327 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3633354604244232, + "eval_runtime": 1.2615, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 3.171, + "eval_token_acc": 0.7706692913385826, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.49181655049324036, + "learning_rate": 5.913915949078452e-05, + "loss": 0.176645827293396, + "memory(GiB)": 129.1, + "step": 465, + "token_acc": 0.8921251348435815, + "train_speed(iter/s)": 0.356557 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.5645484328269958, + "learning_rate": 5.831646772915651e-05, + "loss": 0.13740575313568115, + "memory(GiB)": 129.1, + "step": 470, + "token_acc": 0.9428538968416269, + "train_speed(iter/s)": 0.356147 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 1.3345602750778198, + "learning_rate": 5.749145369680407e-05, + "loss": 0.21261224746704102, + "memory(GiB)": 129.1, + "step": 475, + "token_acc": 0.9206197398622801, + "train_speed(iter/s)": 0.356639 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 
0.3252560496330261, + "learning_rate": 5.666434776868895e-05, + "loss": 0.2075648546218872, + "memory(GiB)": 129.1, + "step": 480, + "token_acc": 0.9252907219944784, + "train_speed(iter/s)": 0.355528 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.36280357837677, + "eval_runtime": 1.2645, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7667322834645669, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.6244832277297974, + "learning_rate": 5.583538090390882e-05, + "loss": 0.13927946090698243, + "memory(GiB)": 129.1, + "step": 485, + "token_acc": 0.8984392671341326, + "train_speed(iter/s)": 0.355772 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5457295179367065, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.28371801376342776, + "memory(GiB)": 129.1, + "step": 490, + "token_acc": 0.8928283642224013, + "train_speed(iter/s)": 0.355994 + }, + { + "epoch": 2.5, + "grad_norm": 0.26068228483200073, + "learning_rate": 5.41727907343245e-05, + "loss": 0.16324831247329713, + "memory(GiB)": 129.1, + "step": 495, + "token_acc": 0.9382183908045977, + "train_speed(iter/s)": 0.356737 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.4725530445575714, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.23240807056427001, + "memory(GiB)": 129.1, + "step": 500, + "token_acc": 0.8918985471558729, + "train_speed(iter/s)": 0.356548 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.36516159772872925, + "eval_runtime": 1.2784, + "eval_samples_per_second": 3.129, + "eval_steps_per_second": 3.129, + "eval_token_acc": 0.7706692913385826, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.5990637540817261, + "learning_rate": 5.250554008935596e-05, + "loss": 0.15861610174179078, + "memory(GiB)": 129.1, + "step": 505, + "token_acc": 0.9060025910464949, + "train_speed(iter/s)": 0.356161 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.7070275545120239, + 
"learning_rate": 5.167074885038373e-05, + "loss": 0.16548032760620118, + "memory(GiB)": 129.1, + "step": 510, + "token_acc": 0.9370354175776126, + "train_speed(iter/s)": 0.357031 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.30997011065483093, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.18845115900039672, + "memory(GiB)": 129.1, + "step": 515, + "token_acc": 0.9271042471042471, + "train_speed(iter/s)": 0.357488 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.34516477584838867, + "learning_rate": 5e-05, + "loss": 0.19470884799957275, + "memory(GiB)": 129.1, + "step": 520, + "token_acc": 0.9188622362039586, + "train_speed(iter/s)": 0.357567 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.35303670167922974, + "eval_runtime": 1.2716, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 3.146, + "eval_token_acc": 0.765748031496063, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.400846928358078, + "learning_rate": 4.916450892453495e-05, + "loss": 0.16326183080673218, + "memory(GiB)": 129.1, + "step": 525, + "token_acc": 0.9170015455950541, + "train_speed(iter/s)": 0.357449 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3526351749897003, + "learning_rate": 4.832925114961629e-05, + "loss": 0.2275157690048218, + "memory(GiB)": 129.1, + "step": 530, + "token_acc": 0.911013136584488, + "train_speed(iter/s)": 0.356693 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7750332355499268, + "learning_rate": 4.749445991064404e-05, + "loss": 0.16660224199295043, + "memory(GiB)": 129.1, + "step": 535, + "token_acc": 0.9465856041689285, + "train_speed(iter/s)": 0.356093 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.008236289024353, + "learning_rate": 4.666036831274392e-05, + "loss": 0.29327480792999266, + "memory(GiB)": 129.1, + "step": 540, + "token_acc": 0.893456980937661, + "train_speed(iter/s)": 0.355675 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.34143465757369995, + 
"eval_runtime": 1.2634, + "eval_samples_per_second": 3.166, + "eval_steps_per_second": 3.166, + "eval_token_acc": 0.764763779527559, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.5062530040740967, + "learning_rate": 4.582720926567552e-05, + "loss": 0.221860933303833, + "memory(GiB)": 129.1, + "step": 545, + "token_acc": 0.8882824294507026, + "train_speed(iter/s)": 0.355094 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4163118004798889, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.171502685546875, + "memory(GiB)": 129.1, + "step": 550, + "token_acc": 0.9311728853872454, + "train_speed(iter/s)": 0.355204 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.6872218251228333, + "learning_rate": 4.416461909609119e-05, + "loss": 0.18130356073379517, + "memory(GiB)": 129.1, + "step": 555, + "token_acc": 0.9372671732975711, + "train_speed(iter/s)": 0.35566 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.2589365839958191, + "learning_rate": 4.333565223131107e-05, + "loss": 0.15754028558731079, + "memory(GiB)": 129.1, + "step": 560, + "token_acc": 0.9293805736322005, + "train_speed(iter/s)": 0.355647 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.328652024269104, + "eval_runtime": 1.2642, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.764763779527559, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.6874573230743408, + "learning_rate": 4.250854630319593e-05, + "loss": 0.2055502414703369, + "memory(GiB)": 129.1, + "step": 565, + "token_acc": 0.9031949899161448, + "train_speed(iter/s)": 0.355089 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.6083143949508667, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.25478286743164064, + "memory(GiB)": 129.1, + "step": 570, + "token_acc": 0.903887358432813, + "train_speed(iter/s)": 0.354778 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.9529440999031067, + "learning_rate": 
4.0860840509215496e-05, + "loss": 0.17642589807510375, + "memory(GiB)": 129.1, + "step": 575, + "token_acc": 0.9331498230436492, + "train_speed(iter/s)": 0.354847 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 0.17912031710147858, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.24578819274902344, + "memory(GiB)": 129.1, + "step": 580, + "token_acc": 0.8957880166106387, + "train_speed(iter/s)": 0.355458 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.32831043004989624, + "eval_runtime": 1.2645, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 3.163, + "eval_token_acc": 0.7706692913385826, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.7658194303512573, + "learning_rate": 3.922334199150432e-05, + "loss": 0.21303670406341552, + "memory(GiB)": 129.1, + "step": 585, + "token_acc": 0.8920780711825488, + "train_speed(iter/s)": 0.355426 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.7090197801589966, + "learning_rate": 3.840899248662358e-05, + "loss": 0.19972538948059082, + "memory(GiB)": 129.1, + "step": 590, + "token_acc": 0.9341101694915255, + "train_speed(iter/s)": 0.354892 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.26023608446121216, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.2259267807006836, + "memory(GiB)": 129.1, + "step": 595, + "token_acc": 0.9335699797160243, + "train_speed(iter/s)": 0.354377 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.8862583041191101, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.13799512386322021, + "memory(GiB)": 129.1, + "step": 600, + "token_acc": 0.957043945174509, + "train_speed(iter/s)": 0.354572 + }, + { + "epoch": 3.0303030303030303, + "eval_loss": 0.30323198437690735, + "eval_runtime": 1.2806, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 3.123, + "eval_token_acc": 0.7706692913385826, + "step": 600 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.5918054580688477, + "learning_rate": 
3.598626885055219e-05, + "loss": 0.08972094058990479, + "memory(GiB)": 129.1, + "step": 605, + "token_acc": 0.9397115384615384, + "train_speed(iter/s)": 0.3534 + }, + { + "epoch": 3.080808080808081, + "grad_norm": 0.7078537940979004, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.04932542443275452, + "memory(GiB)": 129.1, + "step": 610, + "token_acc": 0.975853123129116, + "train_speed(iter/s)": 0.354057 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.13521677255630493, + "learning_rate": 3.4390309625410686e-05, + "loss": 0.03949523568153381, + "memory(GiB)": 129.1, + "step": 615, + "token_acc": 0.9892593421347058, + "train_speed(iter/s)": 0.354635 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 0.430328905582428, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.08747856616973877, + "memory(GiB)": 129.1, + "step": 620, + "token_acc": 0.9691932624113475, + "train_speed(iter/s)": 0.354612 + }, + { + "epoch": 3.1313131313131315, + "eval_loss": 0.306226909160614, + "eval_runtime": 1.3165, + "eval_samples_per_second": 3.038, + "eval_steps_per_second": 3.038, + "eval_token_acc": 0.7696850393700787, + "step": 620 + }, + { + "epoch": 3.1565656565656566, + "grad_norm": 0.376487672328949, + "learning_rate": 3.281178443479852e-05, + "loss": 0.09797981977462769, + "memory(GiB)": 129.1, + "step": 625, + "token_acc": 0.9492996646281318, + "train_speed(iter/s)": 0.353631 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.7452987432479858, + "learning_rate": 3.202961135812437e-05, + "loss": 0.07529096603393555, + "memory(GiB)": 129.1, + "step": 630, + "token_acc": 0.9698409419541417, + "train_speed(iter/s)": 0.354143 + }, + { + "epoch": 3.207070707070707, + "grad_norm": 0.5807965993881226, + "learning_rate": 3.1252456290153954e-05, + "loss": 0.1325251579284668, + "memory(GiB)": 129.1, + "step": 635, + "token_acc": 0.9434507276969225, + "train_speed(iter/s)": 0.354003 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.4517095685005188, + 
"learning_rate": 3.0480536241825263e-05, + "loss": 0.11141908168792725, + "memory(GiB)": 129.1, + "step": 640, + "token_acc": 0.9503339290753456, + "train_speed(iter/s)": 0.353366 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.30879175662994385, + "eval_runtime": 1.3167, + "eval_samples_per_second": 3.038, + "eval_steps_per_second": 3.038, + "eval_token_acc": 0.765748031496063, + "step": 640 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.4209767282009125, + "learning_rate": 2.9714066762261823e-05, + "loss": 0.08485085368156434, + "memory(GiB)": 129.1, + "step": 645, + "token_acc": 0.9452848128619586, + "train_speed(iter/s)": 0.352332 + }, + { + "epoch": 3.282828282828283, + "grad_norm": 0.32493889331817627, + "learning_rate": 2.895326187858326e-05, + "loss": 0.12707052230834961, + "memory(GiB)": 129.1, + "step": 650, + "token_acc": 0.9538642869169894, + "train_speed(iter/s)": 0.352802 + }, + { + "epoch": 3.308080808080808, + "grad_norm": 0.4984124004840851, + "learning_rate": 2.8198334036140874e-05, + "loss": 0.09735980033874511, + "memory(GiB)": 129.1, + "step": 655, + "token_acc": 0.9682322541419669, + "train_speed(iter/s)": 0.352963 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.2004961520433426, + "learning_rate": 2.74494940391949e-05, + "loss": 0.09166445732116699, + "memory(GiB)": 129.1, + "step": 660, + "token_acc": 0.9631701631701631, + "train_speed(iter/s)": 0.353022 + }, + { + "epoch": 3.3333333333333335, + "eval_loss": 0.3101058900356293, + "eval_runtime": 1.2782, + "eval_samples_per_second": 3.129, + "eval_steps_per_second": 3.129, + "eval_token_acc": 0.7627952755905512, + "step": 660 + }, + { + "epoch": 3.3585858585858586, + "grad_norm": 0.8963623046875, + "learning_rate": 2.6706950992050094e-05, + "loss": 0.09054631590843201, + "memory(GiB)": 129.1, + "step": 665, + "token_acc": 0.9231199850718418, + "train_speed(iter/s)": 0.353076 + }, + { + "epoch": 3.3838383838383836, + "grad_norm": 0.8532351851463318, + 
"learning_rate": 2.5970912240665813e-05, + "loss": 0.10762099027633668, + "memory(GiB)": 129.1, + "step": 670, + "token_acc": 0.96045197740113, + "train_speed(iter/s)": 0.353315 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.36199483275413513, + "learning_rate": 2.5241583314757327e-05, + "loss": 0.07633500695228576, + "memory(GiB)": 129.1, + "step": 675, + "token_acc": 0.9642299010244835, + "train_speed(iter/s)": 0.353653 + }, + { + "epoch": 3.4343434343434343, + "grad_norm": 0.6680567860603333, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.1227030634880066, + "memory(GiB)": 129.1, + "step": 680, + "token_acc": 0.942090395480226, + "train_speed(iter/s)": 0.354043 + }, + { + "epoch": 3.4343434343434343, + "eval_loss": 0.3102591335773468, + "eval_runtime": 1.2774, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 3.131, + "eval_token_acc": 0.7578740157480315, + "step": 680 + }, + { + "epoch": 3.45959595959596, + "grad_norm": 0.2870488464832306, + "learning_rate": 2.3803867633181574e-05, + "loss": 0.11645561456680298, + "memory(GiB)": 129.1, + "step": 685, + "token_acc": 0.9112238427393786, + "train_speed(iter/s)": 0.353305 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.666077196598053, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.13571174144744874, + "memory(GiB)": 129.1, + "step": 690, + "token_acc": 0.9530231512699483, + "train_speed(iter/s)": 0.353078 + }, + { + "epoch": 3.51010101010101, + "grad_norm": 0.39280450344085693, + "learning_rate": 2.2395409692487175e-05, + "loss": 0.11649401187896728, + "memory(GiB)": 129.1, + "step": 695, + "token_acc": 0.9593987292732062, + "train_speed(iter/s)": 0.353484 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.24035465717315674, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.038886070251464844, + "memory(GiB)": 129.1, + "step": 700, + "token_acc": 0.9857142857142858, + "train_speed(iter/s)": 0.354064 + }, + { + "epoch": 3.5353535353535355, + "eval_loss": 
0.3143807351589203, + "eval_runtime": 1.2708, + "eval_samples_per_second": 3.148, + "eval_steps_per_second": 3.148, + "eval_token_acc": 0.7588582677165354, + "step": 700 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.3898966610431671, + "learning_rate": 2.1017782560671123e-05, + "loss": 0.06877344250679016, + "memory(GiB)": 129.1, + "step": 705, + "token_acc": 0.9332363107149354, + "train_speed(iter/s)": 0.354088 + }, + { + "epoch": 3.5858585858585856, + "grad_norm": 0.577021062374115, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.0904355764389038, + "memory(GiB)": 129.1, + "step": 710, + "token_acc": 0.9646369533375183, + "train_speed(iter/s)": 0.353937 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.015718597918748856, + "learning_rate": 1.967252487164663e-05, + "loss": 0.008103035390377045, + "memory(GiB)": 129.1, + "step": 715, + "token_acc": 0.9974391805377721, + "train_speed(iter/s)": 0.354851 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.6149921417236328, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.0557898998260498, + "memory(GiB)": 129.1, + "step": 720, + "token_acc": 0.9812704501861672, + "train_speed(iter/s)": 0.354676 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.31738603115081787, + "eval_runtime": 1.2704, + "eval_samples_per_second": 3.149, + "eval_steps_per_second": 3.149, + "eval_token_acc": 0.7618110236220472, + "step": 720 + }, + { + "epoch": 3.6616161616161618, + "grad_norm": 0.5522142052650452, + "learning_rate": 1.836113910678507e-05, + "loss": 0.1061089038848877, + "memory(GiB)": 129.1, + "step": 725, + "token_acc": 0.9314117647058824, + "train_speed(iter/s)": 0.354276 + }, + { + "epoch": 3.686868686868687, + "grad_norm": 0.3203338086605072, + "learning_rate": 1.771860742205988e-05, + "loss": 0.11565899848937988, + "memory(GiB)": 129.1, + "step": 730, + "token_acc": 0.954354001371339, + "train_speed(iter/s)": 0.35397 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 
0.5480756163597107, + "learning_rate": 1.7085089916835923e-05, + "loss": 0.09632692337036133, + "memory(GiB)": 129.1, + "step": 735, + "token_acc": 0.9580305687797545, + "train_speed(iter/s)": 0.353862 + }, + { + "epoch": 3.7373737373737375, + "grad_norm": 0.710368812084198, + "learning_rate": 1.646076349303884e-05, + "loss": 0.12017930746078491, + "memory(GiB)": 129.1, + "step": 740, + "token_acc": 0.9624349836255057, + "train_speed(iter/s)": 0.354334 + }, + { + "epoch": 3.7373737373737375, + "eval_loss": 0.3139352798461914, + "eval_runtime": 1.2642, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 3.164, + "eval_token_acc": 0.7598425196850394, + "step": 740 + }, + { + "epoch": 3.7626262626262625, + "grad_norm": 0.3609558939933777, + "learning_rate": 1.584580248609846e-05, + "loss": 0.06079275012016296, + "memory(GiB)": 129.1, + "step": 745, + "token_acc": 0.9102909482758621, + "train_speed(iter/s)": 0.354549 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.7200530767440796, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.03985466659069061, + "memory(GiB)": 129.1, + "step": 750, + "token_acc": 0.9833689712520789, + "train_speed(iter/s)": 0.355155 + }, + { + "epoch": 3.813131313131313, + "grad_norm": 0.47436633706092834, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.18646769523620604, + "memory(GiB)": 129.1, + "step": 755, + "token_acc": 0.922202486678508, + "train_speed(iter/s)": 0.354059 + }, + { + "epoch": 3.8383838383838382, + "grad_norm": 0.4159301817417145, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.017752669751644135, + "memory(GiB)": 129.1, + "step": 760, + "token_acc": 0.9946977730646872, + "train_speed(iter/s)": 0.354791 + }, + { + "epoch": 3.8383838383838382, + "eval_loss": 0.31617671251296997, + "eval_runtime": 1.2713, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 3.146, + "eval_token_acc": 0.7549212598425197, + "step": 760 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 
0.630095362663269, + "learning_rate": 1.3483006802566544e-05, + "loss": 0.11076927185058594, + "memory(GiB)": 129.1, + "step": 765, + "token_acc": 0.9201399452388196, + "train_speed(iter/s)": 0.354628 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.17331808805465698, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.06400647759437561, + "memory(GiB)": 129.1, + "step": 770, + "token_acc": 0.9771947032859245, + "train_speed(iter/s)": 0.354776 + }, + { + "epoch": 3.9141414141414144, + "grad_norm": 0.5865471959114075, + "learning_rate": 1.2362137491387432e-05, + "loss": 0.09050332307815552, + "memory(GiB)": 129.1, + "step": 775, + "token_acc": 0.9741574731751549, + "train_speed(iter/s)": 0.354814 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.4024655818939209, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.09545594453811646, + "memory(GiB)": 129.1, + "step": 780, + "token_acc": 0.9562637969094923, + "train_speed(iter/s)": 0.354768 + }, + { + "epoch": 3.9393939393939394, + "eval_loss": 0.32290545105934143, + "eval_runtime": 1.2807, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 3.123, + "eval_token_acc": 0.7588582677165354, + "step": 780 + }, + { + "epoch": 3.9646464646464645, + "grad_norm": 2.198448419570923, + "learning_rate": 1.1283304875289336e-05, + "loss": 0.0838412582874298, + "memory(GiB)": 129.1, + "step": 785, + "token_acc": 0.9169477234401349, + "train_speed(iter/s)": 0.354661 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 0.8399145603179932, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.09766408801078796, + "memory(GiB)": 129.1, + "step": 790, + "token_acc": 0.9677739216658403, + "train_speed(iter/s)": 0.355019 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.5167694091796875, + "learning_rate": 1.024771387279585e-05, + "loss": 0.08901907801628113, + "memory(GiB)": 129.1, + "step": 795, + "token_acc": 0.9699577530902832, + "train_speed(iter/s)": 0.355271 + }, + { + "epoch": 
4.040404040404041, + "grad_norm": 0.29580003023147583, + "learning_rate": 9.746497343621857e-06, + "loss": 0.03536704182624817, + "memory(GiB)": 129.1, + "step": 800, + "token_acc": 0.9874716779954685, + "train_speed(iter/s)": 0.355303 + }, + { + "epoch": 4.040404040404041, + "eval_loss": 0.32690542936325073, + "eval_runtime": 1.2877, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 3.106, + "eval_token_acc": 0.7588582677165354, + "step": 800 + }, + { + "epoch": 4.065656565656566, + "grad_norm": 0.6783302426338196, + "learning_rate": 9.256521107059834e-06, + "loss": 0.06580750346183777, + "memory(GiB)": 129.1, + "step": 805, + "token_acc": 0.9491756538985993, + "train_speed(iter/s)": 0.354851 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.02713463269174099, + "learning_rate": 8.777921982911996e-06, + "loss": 0.018977776169776917, + "memory(GiB)": 129.1, + "step": 810, + "token_acc": 0.9939244351623315, + "train_speed(iter/s)": 0.355276 + }, + { + "epoch": 4.116161616161616, + "grad_norm": 0.47375404834747314, + "learning_rate": 8.310833614062651e-06, + "loss": 0.04440165162086487, + "memory(GiB)": 129.1, + "step": 815, + "token_acc": 0.9830610103432769, + "train_speed(iter/s)": 0.35558 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.2699472904205322, + "learning_rate": 7.85538642916015e-06, + "loss": 0.0346536248922348, + "memory(GiB)": 129.1, + "step": 820, + "token_acc": 0.9884083816317432, + "train_speed(iter/s)": 0.355478 + }, + { + "epoch": 4.141414141414141, + "eval_loss": 0.33224982023239136, + "eval_runtime": 1.266, + "eval_samples_per_second": 3.16, + "eval_steps_per_second": 3.16, + "eval_token_acc": 0.7578740157480315, + "step": 820 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.46439409255981445, + "learning_rate": 7.4117076061961885e-06, + "loss": 0.060616308450698854, + "memory(GiB)": 129.1, + "step": 825, + "token_acc": 0.9562847370671227, + "train_speed(iter/s)": 0.354922 + }, + { + "epoch": 4.191919191919192, 
+ "grad_norm": 0.32245391607284546, + "learning_rate": 6.979921036993042e-06, + "loss": 0.07455227375030518, + "memory(GiB)": 129.1, + "step": 830, + "token_acc": 0.9444043321299639, + "train_speed(iter/s)": 0.354969 + }, + { + "epoch": 4.217171717171717, + "grad_norm": 0.6579491496086121, + "learning_rate": 6.5601472926081766e-06, + "loss": 0.09966359734535217, + "memory(GiB)": 129.1, + "step": 835, + "token_acc": 0.9565217391304348, + "train_speed(iter/s)": 0.355055 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.44027379155158997, + "learning_rate": 6.152503589666425e-06, + "loss": 0.07703586220741272, + "memory(GiB)": 129.1, + "step": 840, + "token_acc": 0.9676385773790451, + "train_speed(iter/s)": 0.354947 + }, + { + "epoch": 4.242424242424242, + "eval_loss": 0.3352506160736084, + "eval_runtime": 1.2662, + "eval_samples_per_second": 3.159, + "eval_steps_per_second": 3.159, + "eval_token_acc": 0.7568897637795275, + "step": 840 + }, + { + "epoch": 4.267676767676767, + "grad_norm": 0.8359324932098389, + "learning_rate": 5.757103757628573e-06, + "loss": 0.1429282546043396, + "memory(GiB)": 129.1, + "step": 845, + "token_acc": 0.922793074084587, + "train_speed(iter/s)": 0.354142 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 0.27999410033226013, + "learning_rate": 5.374058207005944e-06, + "loss": 0.10208557844161988, + "memory(GiB)": 129.1, + "step": 850, + "token_acc": 0.961126817447496, + "train_speed(iter/s)": 0.353931 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.39005765318870544, + "learning_rate": 5.0034738985296095e-06, + "loss": 0.035950151085853574, + "memory(GiB)": 129.1, + "step": 855, + "token_acc": 0.9865144100054377, + "train_speed(iter/s)": 0.353884 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 0.254375696182251, + "learning_rate": 4.645454313282965e-06, + "loss": 0.034918776154518126, + "memory(GiB)": 129.1, + "step": 860, + "token_acc": 0.9884200718754159, + "train_speed(iter/s)": 0.353895 + }, + { + "epoch": 
4.343434343434343, + "eval_loss": 0.3357633650302887, + "eval_runtime": 1.2612, + "eval_samples_per_second": 3.172, + "eval_steps_per_second": 3.172, + "eval_token_acc": 0.7539370078740157, + "step": 860 + }, + { + "epoch": 4.3686868686868685, + "grad_norm": 0.5025836229324341, + "learning_rate": 4.3000994238058644e-06, + "loss": 0.04228464365005493, + "memory(GiB)": 129.1, + "step": 865, + "token_acc": 0.954431050470261, + "train_speed(iter/s)": 0.353642 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.6179090142250061, + "learning_rate": 3.967505666178556e-06, + "loss": 0.03816230297088623, + "memory(GiB)": 129.1, + "step": 870, + "token_acc": 0.9892336922102597, + "train_speed(iter/s)": 0.35391 + }, + { + "epoch": 4.41919191919192, + "grad_norm": 0.42816147208213806, + "learning_rate": 3.647765913093132e-06, + "loss": 0.023942221701145173, + "memory(GiB)": 129.1, + "step": 875, + "token_acc": 0.9865377322715206, + "train_speed(iter/s)": 0.354293 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.40714192390441895, + "learning_rate": 3.340969447919873e-06, + "loss": 0.05464286208152771, + "memory(GiB)": 129.1, + "step": 880, + "token_acc": 0.9736096615476368, + "train_speed(iter/s)": 0.354416 + }, + { + "epoch": 4.444444444444445, + "eval_loss": 0.3359661400318146, + "eval_runtime": 1.3133, + "eval_samples_per_second": 3.046, + "eval_steps_per_second": 3.046, + "eval_token_acc": 0.7559055118110236, + "step": 880 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.34820908308029175, + "learning_rate": 3.0472019397761064e-06, + "loss": 0.03318539261817932, + "memory(GiB)": 129.1, + "step": 885, + "token_acc": 0.9598147220831247, + "train_speed(iter/s)": 0.35415 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 0.42267289757728577, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.05409420132637024, + "memory(GiB)": 129.1, + "step": 890, + "token_acc": 0.9825274278748476, + "train_speed(iter/s)": 0.354092 + }, + { + "epoch": 4.52020202020202, 
+ "grad_norm": 0.4588811993598938, + "learning_rate": 2.4990782572647975e-06, + "loss": 0.024935531616210937, + "memory(GiB)": 129.1, + "step": 895, + "token_acc": 0.9860741347532255, + "train_speed(iter/s)": 0.354453 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.6112403869628906, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.11395398378372193, + "memory(GiB)": 129.1, + "step": 900, + "token_acc": 0.9591113972955569, + "train_speed(iter/s)": 0.354293 + }, + { + "epoch": 4.545454545454545, + "eval_loss": 0.3369322121143341, + "eval_runtime": 1.3215, + "eval_samples_per_second": 3.027, + "eval_steps_per_second": 3.027, + "eval_token_acc": 0.7549212598425197, + "step": 900 + }, + { + "epoch": 4.570707070707071, + "grad_norm": 0.2242911458015442, + "learning_rate": 2.004007049848461e-06, + "loss": 0.00986407846212387, + "memory(GiB)": 129.1, + "step": 905, + "token_acc": 0.9589934762348555, + "train_speed(iter/s)": 0.354216 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 0.14168986678123474, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.015086154639720916, + "memory(GiB)": 129.1, + "step": 910, + "token_acc": 0.9951715374841169, + "train_speed(iter/s)": 0.354789 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.027493759989738464, + "learning_rate": 1.5625412489637337e-06, + "loss": 0.0722315788269043, + "memory(GiB)": 129.1, + "step": 915, + "token_acc": 0.9734163755126842, + "train_speed(iter/s)": 0.354981 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 0.2218606173992157, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.029132437705993653, + "memory(GiB)": 129.1, + "step": 920, + "token_acc": 0.9898477157360406, + "train_speed(iter/s)": 0.355396 + }, + { + "epoch": 4.646464646464646, + "eval_loss": 0.33511751890182495, + "eval_runtime": 1.2779, + "eval_samples_per_second": 3.13, + "eval_steps_per_second": 3.13, + "eval_token_acc": 0.7549212598425197, + "step": 920 + }, + { + "epoch": 4.671717171717171, + 
"grad_norm": 0.5357446074485779, + "learning_rate": 1.1751739156407649e-06, + "loss": 0.014484831690788269, + "memory(GiB)": 129.1, + "step": 925, + "token_acc": 0.9550669216061185, + "train_speed(iter/s)": 0.355269 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.034110553562641144, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.09904469847679138, + "memory(GiB)": 129.1, + "step": 930, + "token_acc": 0.9562597200622084, + "train_speed(iter/s)": 0.355569 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.46665722131729126, + "learning_rate": 8.423376898168245e-07, + "loss": 0.09365715980529785, + "memory(GiB)": 129.1, + "step": 935, + "token_acc": 0.9637131611788423, + "train_speed(iter/s)": 0.355182 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 0.08903438597917557, + "learning_rate": 6.964873004985717e-07, + "loss": 0.04413898587226868, + "memory(GiB)": 129.1, + "step": 940, + "token_acc": 0.9745094750964279, + "train_speed(iter/s)": 0.355403 + }, + { + "epoch": 4.747474747474747, + "eval_loss": 0.33372214436531067, + "eval_runtime": 1.3056, + "eval_samples_per_second": 3.064, + "eval_steps_per_second": 3.064, + "eval_token_acc": 0.7549212598425197, + "step": 940 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.2658371031284332, + "learning_rate": 5.644043071326932e-07, + "loss": 0.039725151658058164, + "memory(GiB)": 129.1, + "step": 945, + "token_acc": 0.9612076380526406, + "train_speed(iter/s)": 0.354412 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 0.03347943350672722, + "learning_rate": 4.461255922609986e-07, + "loss": 0.03821060359477997, + "memory(GiB)": 129.1, + "step": 950, + "token_acc": 0.975949000289771, + "train_speed(iter/s)": 0.354365 + }, + { + "epoch": 4.8232323232323235, + "grad_norm": 0.37791627645492554, + "learning_rate": 3.416841837512952e-07, + "loss": 0.01510193943977356, + "memory(GiB)": 129.1, + "step": 955, + "token_acc": 0.995788365359305, + "train_speed(iter/s)": 0.354476 + }, + { + "epoch": 
4.848484848484849, + "grad_norm": 0.6039087176322937, + "learning_rate": 2.511092455747932e-07, + "loss": 0.08025823831558228, + "memory(GiB)": 129.1, + "step": 960, + "token_acc": 0.9634054135793134, + "train_speed(iter/s)": 0.354659 + }, + { + "epoch": 4.848484848484849, + "eval_loss": 0.33536115288734436, + "eval_runtime": 1.2813, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 3.122, + "eval_token_acc": 0.7559055118110236, + "step": 960 + }, + { + "epoch": 4.873737373737374, + "grad_norm": 0.24173401296138763, + "learning_rate": 1.7442606966242004e-07, + "loss": 0.04071699380874634, + "memory(GiB)": 129.1, + "step": 965, + "token_acc": 0.9473123191716156, + "train_speed(iter/s)": 0.354442 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 0.4899163246154785, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.04371882379055023, + "memory(GiB)": 129.1, + "step": 970, + "token_acc": 0.9862877911779283, + "train_speed(iter/s)": 0.35482 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.2417013943195343, + "learning_rate": 6.281677086071303e-08, + "loss": 0.02706504464149475, + "memory(GiB)": 129.1, + "step": 975, + "token_acc": 0.9944558521560575, + "train_speed(iter/s)": 0.355295 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 0.11299926042556763, + "learning_rate": 2.792181348726941e-08, + "loss": 0.069762122631073, + "memory(GiB)": 129.1, + "step": 980, + "token_acc": 0.9661454379839077, + "train_speed(iter/s)": 0.355325 + }, + { + "epoch": 4.94949494949495, + "eval_loss": 0.33634763956069946, + "eval_runtime": 1.2882, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 3.105, + "eval_token_acc": 0.7549212598425197, + "step": 980 + }, + { + "epoch": 4.974747474747475, + "grad_norm": 0.333428293466568, + "learning_rate": 6.980940707146389e-09, + "loss": 0.13659827709197997, + "memory(GiB)": 129.1, + "step": 985, + "token_acc": 0.9222326748196927, + "train_speed(iter/s)": 0.354804 + }, + { + "epoch": 5.0, + "grad_norm": 
0.2640458941459656, + "learning_rate": 0.0, + "loss": 0.016735257208347322, + "memory(GiB)": 129.1, + "step": 990, + "token_acc": 0.9945504087193461, + "train_speed(iter/s)": 0.355027 + }, + { + "epoch": 5.0, + "eval_loss": 0.3339096009731293, + "eval_runtime": 1.2791, + "eval_samples_per_second": 3.127, + "eval_steps_per_second": 3.127, + "eval_token_acc": 0.7559055118110236, + "step": 990 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.95876726119936e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/training_args.bin b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..07b2e49761b5734cbcf2c2a471961ee74cc1f1e3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6c9cb0acbab470bfb75bbb0e550391c75364a947acbeffea57da7bacf0844ba +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..4e1ba76ffed1466f8f4d60d03fa2f3a8f87ca6d6 Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..bb0a6e0a5f769827ddf6c52d036e2959bac25311 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a03a193537f8c972ba53fcf04d6130e6a4e912b3 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..6f83f9883b7e72d6c2646feffb61b3df896663d1 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_token_acc.png new file mode 100644 index 
0000000000000000000000000000000000000000..8eb577d63862b18aefd82bff3120e843a7fb5b8b Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..28af4713ddf3ee940ab6789f15d73de984b4c8ea Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..5b7e0742098e62b271785a7d16a5c68970916e95 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..8977f64dd4afd86ccf4d0ec848be41268d0e90fc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_loss.png 
b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a1590e7ad0e34cc5890e192584d3d7ffc0044e3d Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..c5503ce6d5bebb6580ae0ec2e1df569ab8a7cbe2 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..85e472f2507d32ec0670b0e2bb98455613867368 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..9d1a3b98587d602d148c7cea763304b781293fe8 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_total_flos.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1d98373bac52be5068e9e0833cf8b37d3b7038ca Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_runtime.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..56e08009eaaee7f0447c0d341964c30ed8d9d1e8 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..599ef052fa39a669d4d3dd12c7d118b8b1d40f49 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..f1151bdf7b68b526e58b68511d3d5a2e0c4002a4 Binary files /dev/null 
and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..43376697894269bbf00d3a26861fdc87f4d7b884 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/logging.jsonl b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d9cfca8638944eced280eda4b69629a99b41660 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/logging.jsonl @@ -0,0 +1,251 @@ +{"loss": 0.59511995, "token_acc": 0.83944954, "grad_norm": 0.15499143, "learning_rate": 2e-06, "memory(GiB)": 71.81, "train_speed(iter/s)": 0.207991, "epoch": 0.00505051, "global_step/max_steps": "1/990", "percentage": "0.10%", "elapsed_time": "4s", "remaining_time": "1h 12m 53s"} +{"loss": 0.711007, "token_acc": 0.8298845, "grad_norm": 0.31547654, "learning_rate": 1e-05, "memory(GiB)": 81.23, "train_speed(iter/s)": 0.34062, "epoch": 0.02525253, "global_step/max_steps": "5/990", "percentage": "0.51%", "elapsed_time": "14s", "remaining_time": "46m 55s"} +{"loss": 0.75902724, "token_acc": 0.79567499, "grad_norm": 0.18780562, "learning_rate": 2e-05, "memory(GiB)": 87.6, "train_speed(iter/s)": 0.36629, "epoch": 0.05050505, "global_step/max_steps": "10/990", "percentage": "1.01%", "elapsed_time": "26s", 
"remaining_time": "43m 57s"} +{"loss": 0.73056569, "token_acc": 0.80007845, "grad_norm": 0.20020846, "learning_rate": 3e-05, "memory(GiB)": 98.05, "train_speed(iter/s)": 0.368677, "epoch": 0.07575758, "global_step/max_steps": "15/990", "percentage": "1.52%", "elapsed_time": "40s", "remaining_time": "43m 39s"} +{"loss": 1.03810511, "token_acc": 0.85345328, "grad_norm": 3.01246572, "learning_rate": 4e-05, "memory(GiB)": 98.05, "train_speed(iter/s)": 0.388448, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "51s", "remaining_time": "41m 18s"} +{"eval_loss": 1.60395765, "eval_token_acc": 0.71751969, "eval_runtime": 1.3108, "eval_samples_per_second": 3.051, "eval_steps_per_second": 3.051, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "52s", "remaining_time": "42m 22s"} +{"loss": 0.53468814, "token_acc": 0.83041618, "grad_norm": 0.24263285, "learning_rate": 5e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.355688, "epoch": 0.12626263, "global_step/max_steps": "25/990", "percentage": "2.53%", "elapsed_time": "1m 9s", "remaining_time": "44m 58s"} +{"loss": 0.62092094, "token_acc": 0.81556614, "grad_norm": 0.49346253, "learning_rate": 6e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.368676, "epoch": 0.15151515, "global_step/max_steps": "30/990", "percentage": "3.03%", "elapsed_time": "1m 20s", "remaining_time": "43m 11s"} +{"loss": 0.36601179, "token_acc": 0.85580159, "grad_norm": 0.15310031, "learning_rate": 7e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.37743, "epoch": 0.17676768, "global_step/max_steps": "35/990", "percentage": "3.54%", "elapsed_time": "1m 32s", "remaining_time": "41m 59s"} +{"loss": 0.44290204, "token_acc": 0.84572923, "grad_norm": 0.11761368, "learning_rate": 8e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.380325, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "1m 44s", "remaining_time": 
"41m 28s"} +{"eval_loss": 0.57906944, "eval_token_acc": 0.74409449, "eval_runtime": 1.3071, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "1m 46s", "remaining_time": "41m 59s"} +{"loss": 0.45088105, "token_acc": 0.83640974, "grad_norm": 0.20896034, "learning_rate": 9e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.367432, "epoch": 0.22727273, "global_step/max_steps": "45/990", "percentage": "4.55%", "elapsed_time": "2m 2s", "remaining_time": "42m 43s"} +{"loss": 0.50381284, "token_acc": 0.8802866, "grad_norm": 0.14679442, "learning_rate": 0.0001, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.35784, "epoch": 0.25252525, "global_step/max_steps": "50/990", "percentage": "5.05%", "elapsed_time": "2m 19s", "remaining_time": "43m 39s"} +{"loss": 0.45382376, "token_acc": 0.85119667, "grad_norm": 0.19582348, "learning_rate": 9.999e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.358299, "epoch": 0.27777778, "global_step/max_steps": "55/990", "percentage": "5.56%", "elapsed_time": "2m 33s", "remaining_time": "43m 22s"} +{"loss": 0.38322082, "token_acc": 0.85990605, "grad_norm": 0.22766544, "learning_rate": 9.997e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.366204, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "2m 43s", "remaining_time": "42m 13s"} +{"eval_loss": 0.5421443, "eval_token_acc": 0.7519685, "eval_runtime": 1.3042, "eval_samples_per_second": 3.067, "eval_steps_per_second": 3.067, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "2m 44s", "remaining_time": "42m 33s"} +{"loss": 0.55069761, "token_acc": 0.82621883, "grad_norm": 0.28758597, "learning_rate": 9.994e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.359199, "epoch": 0.32828283, "global_step/max_steps": "65/990", "percentage": "6.57%", "elapsed_time": "3m 0s", "remaining_time": "42m 
49s"} +{"loss": 0.41149077, "token_acc": 0.87373354, "grad_norm": 0.11081423, "learning_rate": 9.989e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.356847, "epoch": 0.35353535, "global_step/max_steps": "70/990", "percentage": "7.07%", "elapsed_time": "3m 15s", "remaining_time": "42m 53s"} +{"loss": 0.57968364, "token_acc": 0.82773991, "grad_norm": 0.16865833, "learning_rate": 9.983e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.363118, "epoch": 0.37878788, "global_step/max_steps": "75/990", "percentage": "7.58%", "elapsed_time": "3m 26s", "remaining_time": "41m 55s"} +{"loss": 0.6482029, "token_acc": 0.84187344, "grad_norm": 0.17173523, "learning_rate": 9.975e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.364963, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "3m 38s", "remaining_time": "41m 29s"} +{"eval_loss": 0.53495944, "eval_token_acc": 0.74901575, "eval_runtime": 1.2882, "eval_samples_per_second": 3.105, "eval_steps_per_second": 3.105, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "3m 40s", "remaining_time": "41m 43s"} +{"loss": 0.52915583, "token_acc": 0.81985044, "grad_norm": 0.13993062, "learning_rate": 9.966e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.360238, "epoch": 0.42929293, "global_step/max_steps": "85/990", "percentage": "8.59%", "elapsed_time": "3m 55s", "remaining_time": "41m 48s"} +{"loss": 0.43833165, "token_acc": 0.84564103, "grad_norm": 0.33867121, "learning_rate": 9.955e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.36297, "epoch": 0.45454545, "global_step/max_steps": "90/990", "percentage": "9.09%", "elapsed_time": "4m 7s", "remaining_time": "41m 15s"} +{"loss": 0.3532156, "token_acc": 0.86914721, "grad_norm": 0.12106973, "learning_rate": 9.944e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.36177, "epoch": 0.47979798, "global_step/max_steps": "95/990", "percentage": "9.60%", "elapsed_time": "4m 22s", 
"remaining_time": "41m 10s"} +{"loss": 0.41011238, "token_acc": 0.86714761, "grad_norm": 0.13638309, "learning_rate": 9.93e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357637, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "4m 39s", "remaining_time": "41m 25s"} +{"eval_loss": 0.52577609, "eval_token_acc": 0.75885827, "eval_runtime": 1.2863, "eval_samples_per_second": 3.11, "eval_steps_per_second": 3.11, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "4m 40s", "remaining_time": "41m 36s"} +{"loss": 0.54750195, "token_acc": 0.80721003, "grad_norm": 0.18218821, "learning_rate": 9.916e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356489, "epoch": 0.53030303, "global_step/max_steps": "105/990", "percentage": "10.61%", "elapsed_time": "4m 54s", "remaining_time": "41m 19s"} +{"loss": 0.45719471, "token_acc": 0.82269504, "grad_norm": 0.64565933, "learning_rate": 9.9e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.362336, "epoch": 0.55555556, "global_step/max_steps": "110/990", "percentage": "11.11%", "elapsed_time": "5m 3s", "remaining_time": "40m 25s"} +{"loss": 0.44896727, "token_acc": 0.86722674, "grad_norm": 0.09764645, "learning_rate": 9.882e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357594, "epoch": 0.58080808, "global_step/max_steps": "115/990", "percentage": "11.62%", "elapsed_time": "5m 21s", "remaining_time": "40m 43s"} +{"loss": 0.50430727, "token_acc": 0.81380496, "grad_norm": 0.30668208, "learning_rate": 9.864e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357072, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "5m 35s", "remaining_time": "40m 33s"} +{"eval_loss": 0.51913744, "eval_token_acc": 0.76181102, "eval_runtime": 1.2824, "eval_samples_per_second": 3.119, "eval_steps_per_second": 3.119, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", 
"elapsed_time": "5m 36s", "remaining_time": "40m 42s"} +{"loss": 0.47657542, "token_acc": 0.83701435, "grad_norm": 0.20023687, "learning_rate": 9.844e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356444, "epoch": 0.63131313, "global_step/max_steps": "125/990", "percentage": "12.63%", "elapsed_time": "5m 50s", "remaining_time": "40m 24s"} +{"loss": 0.48636141, "token_acc": 0.84085792, "grad_norm": 0.21041052, "learning_rate": 9.822e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356648, "epoch": 0.65656566, "global_step/max_steps": "130/990", "percentage": "13.13%", "elapsed_time": "6m 4s", "remaining_time": "40m 8s"} +{"loss": 0.36075943, "token_acc": 0.87045876, "grad_norm": 0.10671097, "learning_rate": 9.8e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356657, "epoch": 0.68181818, "global_step/max_steps": "135/990", "percentage": "13.64%", "elapsed_time": "6m 18s", "remaining_time": "39m 54s"} +{"loss": 0.53395758, "token_acc": 0.82784875, "grad_norm": 0.21016854, "learning_rate": 9.776e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356881, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "6m 31s", "remaining_time": "39m 39s"} +{"eval_loss": 0.47979286, "eval_token_acc": 0.75787402, "eval_runtime": 1.306, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "6m 33s", "remaining_time": "39m 47s"} +{"loss": 0.36088202, "token_acc": 0.85711327, "grad_norm": 0.2102388, "learning_rate": 9.75e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.353305, "epoch": 0.73232323, "global_step/max_steps": "145/990", "percentage": "14.65%", "elapsed_time": "6m 50s", "remaining_time": "39m 49s"} +{"loss": 0.37517111, "token_acc": 0.88222555, "grad_norm": 0.45031449, "learning_rate": 9.723e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354538, "epoch": 0.75757576, "global_step/max_steps": 
"150/990", "percentage": "15.15%", "elapsed_time": "7m 2s", "remaining_time": "39m 27s"} +{"loss": 0.41892757, "token_acc": 0.85840166, "grad_norm": 0.23807578, "learning_rate": 9.695e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354578, "epoch": 0.78282828, "global_step/max_steps": "155/990", "percentage": "15.66%", "elapsed_time": "7m 16s", "remaining_time": "39m 12s"} +{"loss": 0.33987851, "token_acc": 0.88178818, "grad_norm": 0.19749346, "learning_rate": 9.666e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356324, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "7m 28s", "remaining_time": "38m 47s"} +{"eval_loss": 0.46302199, "eval_token_acc": 0.75590551, "eval_runtime": 1.2753, "eval_samples_per_second": 3.137, "eval_steps_per_second": 3.137, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "7m 29s", "remaining_time": "38m 53s"} +{"loss": 0.41779904, "token_acc": 0.84991435, "grad_norm": 0.14096139, "learning_rate": 9.635e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354285, "epoch": 0.83333333, "global_step/max_steps": "165/990", "percentage": "16.67%", "elapsed_time": "7m 45s", "remaining_time": "38m 46s"} +{"loss": 0.45575361, "token_acc": 0.85244189, "grad_norm": 0.20421652, "learning_rate": 9.603e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.35386, "epoch": 0.85858586, "global_step/max_steps": "170/990", "percentage": "17.17%", "elapsed_time": "8m 0s", "remaining_time": "38m 35s"} +{"loss": 0.41198802, "token_acc": 0.8568083, "grad_norm": 0.27700654, "learning_rate": 9.57e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.35561, "epoch": 0.88383838, "global_step/max_steps": "175/990", "percentage": "17.68%", "elapsed_time": "8m 11s", "remaining_time": "38m 10s"} +{"loss": 0.4143014, "token_acc": 0.86226505, "grad_norm": 0.19533886, "learning_rate": 9.535e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356495, "epoch": 
0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "8m 24s", "remaining_time": "37m 50s"} +{"eval_loss": 0.47494349, "eval_token_acc": 0.76574803, "eval_runtime": 1.271, "eval_samples_per_second": 3.147, "eval_steps_per_second": 3.147, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "8m 25s", "remaining_time": "37m 56s"} +{"loss": 0.49581571, "token_acc": 0.81718946, "grad_norm": 0.25879252, "learning_rate": 9.5e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355245, "epoch": 0.93434343, "global_step/max_steps": "185/990", "percentage": "18.69%", "elapsed_time": "8m 40s", "remaining_time": "37m 44s"} +{"loss": 0.56606345, "token_acc": 0.80100474, "grad_norm": 2.98516035, "learning_rate": 9.463e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357613, "epoch": 0.95959596, "global_step/max_steps": "190/990", "percentage": "19.19%", "elapsed_time": "8m 50s", "remaining_time": "37m 15s"} +{"loss": 0.56026711, "token_acc": 0.8152332, "grad_norm": 0.15361513, "learning_rate": 9.424e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357337, "epoch": 0.98484848, "global_step/max_steps": "195/990", "percentage": "19.70%", "elapsed_time": "9m 5s", "remaining_time": "37m 3s"} +{"loss": 0.50358634, "token_acc": 0.84823745, "grad_norm": 0.15566339, "learning_rate": 9.385e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353905, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "9m 24s", "remaining_time": "37m 10s"} +{"eval_loss": 0.4684031, "eval_token_acc": 0.76476378, "eval_runtime": 1.2829, "eval_samples_per_second": 3.118, "eval_steps_per_second": 3.118, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "9m 26s", "remaining_time": "37m 15s"} +{"loss": 0.45102987, "token_acc": 0.83515121, "grad_norm": 0.1639636, "learning_rate": 9.344e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 
0.350077, "epoch": 1.03535354, "global_step/max_steps": "205/990", "percentage": "20.71%", "elapsed_time": "9m 45s", "remaining_time": "37m 20s"} +{"loss": 0.42268276, "token_acc": 0.86158916, "grad_norm": 0.20765099, "learning_rate": 9.302e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350194, "epoch": 1.06060606, "global_step/max_steps": "210/990", "percentage": "21.21%", "elapsed_time": "9m 59s", "remaining_time": "37m 5s"} +{"loss": 0.38046865, "token_acc": 0.87460399, "grad_norm": 0.1713054, "learning_rate": 9.259e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349574, "epoch": 1.08585859, "global_step/max_steps": "215/990", "percentage": "21.72%", "elapsed_time": "10m 14s", "remaining_time": "36m 55s"} +{"loss": 0.35848026, "token_acc": 0.87960545, "grad_norm": 0.31240323, "learning_rate": 9.214e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350614, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "10m 27s", "remaining_time": "36m 34s"} +{"eval_loss": 0.47862387, "eval_token_acc": 0.75492126, "eval_runtime": 1.265, "eval_samples_per_second": 3.162, "eval_steps_per_second": 3.162, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "10m 28s", "remaining_time": "36m 39s"} +{"loss": 0.48271027, "token_acc": 0.84294257, "grad_norm": 0.36063838, "learning_rate": 9.169e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349116, "epoch": 1.13636364, "global_step/max_steps": "225/990", "percentage": "22.73%", "elapsed_time": "10m 44s", "remaining_time": "36m 29s"} +{"loss": 0.39199128, "token_acc": 0.86866776, "grad_norm": 0.21145633, "learning_rate": 9.122e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.347125, "epoch": 1.16161616, "global_step/max_steps": "230/990", "percentage": "23.23%", "elapsed_time": "11m 2s", "remaining_time": "36m 28s"} +{"loss": 0.20283978, "token_acc": 0.9086758, "grad_norm": 0.24240956, "learning_rate": 9.074e-05, "memory(GiB)": 
129.1, "train_speed(iter/s)": 0.34928, "epoch": 1.18686869, "global_step/max_steps": "235/990", "percentage": "23.74%", "elapsed_time": "11m 12s", "remaining_time": "36m 0s"} +{"loss": 0.44605064, "token_acc": 0.85399091, "grad_norm": 0.40218166, "learning_rate": 9.025e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350502, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "11m 24s", "remaining_time": "35m 38s"} +{"eval_loss": 0.47736126, "eval_token_acc": 0.76279528, "eval_runtime": 1.2844, "eval_samples_per_second": 3.114, "eval_steps_per_second": 3.114, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "11m 25s", "remaining_time": "35m 42s"} +{"loss": 0.28942854, "token_acc": 0.85619648, "grad_norm": 0.24721949, "learning_rate": 8.975e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35052, "epoch": 1.23737374, "global_step/max_steps": "245/990", "percentage": "24.75%", "elapsed_time": "11m 38s", "remaining_time": "35m 24s"} +{"loss": 0.42176371, "token_acc": 0.84974781, "grad_norm": 0.49620357, "learning_rate": 8.924e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352588, "epoch": 1.26262626, "global_step/max_steps": "250/990", "percentage": "25.25%", "elapsed_time": "11m 48s", "remaining_time": "34m 57s"} +{"loss": 0.37904081, "token_acc": 0.85962824, "grad_norm": 0.25094041, "learning_rate": 8.872e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.351148, "epoch": 1.28787879, "global_step/max_steps": "255/990", "percentage": "25.76%", "elapsed_time": "12m 5s", "remaining_time": "34m 52s"} +{"loss": 0.38634362, "token_acc": 0.85502244, "grad_norm": 0.39996538, "learning_rate": 8.818e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352259, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "12m 17s", "remaining_time": "34m 31s"} +{"eval_loss": 0.43004662, "eval_token_acc": 0.77066929, "eval_runtime": 1.2631, 
"eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "12m 18s", "remaining_time": "34m 34s"} +{"loss": 0.30726192, "token_acc": 0.89507048, "grad_norm": 0.13070877, "learning_rate": 8.764e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.348668, "epoch": 1.33838384, "global_step/max_steps": "265/990", "percentage": "26.77%", "elapsed_time": "12m 39s", "remaining_time": "34m 38s"} +{"loss": 0.28099468, "token_acc": 0.89453861, "grad_norm": 0.29351169, "learning_rate": 8.708e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349732, "epoch": 1.36363636, "global_step/max_steps": "270/990", "percentage": "27.27%", "elapsed_time": "12m 51s", "remaining_time": "34m 17s"} +{"loss": 0.29236732, "token_acc": 0.89542484, "grad_norm": 0.29749787, "learning_rate": 8.652e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349425, "epoch": 1.38888889, "global_step/max_steps": "275/990", "percentage": "27.78%", "elapsed_time": "13m 6s", "remaining_time": "34m 5s"} +{"loss": 0.42062225, "token_acc": 0.86478632, "grad_norm": 0.58734351, "learning_rate": 8.594e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350425, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "13m 18s", "remaining_time": "33m 45s"} +{"eval_loss": 0.40438846, "eval_token_acc": 0.7726378, "eval_runtime": 1.2688, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "13m 19s", "remaining_time": "33m 48s"} +{"loss": 0.23297641, "token_acc": 0.88995543, "grad_norm": 0.52940971, "learning_rate": 8.536e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350232, "epoch": 1.43939394, "global_step/max_steps": "285/990", "percentage": "28.79%", "elapsed_time": "13m 33s", "remaining_time": "33m 31s"} +{"loss": 0.36497815, "token_acc": 0.86239644, 
"grad_norm": 0.45061117, "learning_rate": 8.476e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.351651, "epoch": 1.46464646, "global_step/max_steps": "290/990", "percentage": "29.29%", "elapsed_time": "13m 44s", "remaining_time": "33m 9s"} +{"loss": 0.39626932, "token_acc": 0.85288089, "grad_norm": 0.30804959, "learning_rate": 8.415e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353152, "epoch": 1.48989899, "global_step/max_steps": "295/990", "percentage": "29.80%", "elapsed_time": "13m 54s", "remaining_time": "32m 47s"} +{"loss": 0.32068715, "token_acc": 0.88614938, "grad_norm": 1.60680878, "learning_rate": 8.354e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353426, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "14m 8s", "remaining_time": "32m 31s"} +{"eval_loss": 0.42214763, "eval_token_acc": 0.77755906, "eval_runtime": 1.2774, "eval_samples_per_second": 3.131, "eval_steps_per_second": 3.131, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "14m 9s", "remaining_time": "32m 34s"} +{"loss": 0.3652679, "token_acc": 0.86865866, "grad_norm": 0.36006844, "learning_rate": 8.291e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350995, "epoch": 1.54040404, "global_step/max_steps": "305/990", "percentage": "30.81%", "elapsed_time": "14m 28s", "remaining_time": "32m 30s"} +{"loss": 0.30328705, "token_acc": 0.89976134, "grad_norm": 0.33816397, "learning_rate": 8.228e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352277, "epoch": 1.56565657, "global_step/max_steps": "310/990", "percentage": "31.31%", "elapsed_time": "14m 39s", "remaining_time": "32m 9s"} +{"loss": 0.32169333, "token_acc": 0.8734252, "grad_norm": 0.84992385, "learning_rate": 8.164e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353612, "epoch": 1.59090909, "global_step/max_steps": "315/990", "percentage": "31.82%", "elapsed_time": "14m 50s", "remaining_time": "31m 48s"} +{"loss": 
0.33423264, "token_acc": 0.90032204, "grad_norm": 0.37628281, "learning_rate": 8.099e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354058, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "15m 3s", "remaining_time": "31m 31s"} +{"eval_loss": 0.4008275, "eval_token_acc": 0.7726378, "eval_runtime": 1.2671, "eval_samples_per_second": 3.157, "eval_steps_per_second": 3.157, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "15m 4s", "remaining_time": "31m 34s"} +{"loss": 0.35271802, "token_acc": 0.85239001, "grad_norm": 0.3769632, "learning_rate": 8.033e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353524, "epoch": 1.64141414, "global_step/max_steps": "325/990", "percentage": "32.83%", "elapsed_time": "15m 18s", "remaining_time": "31m 20s"} +{"loss": 0.38188653, "token_acc": 0.88395623, "grad_norm": 0.57467413, "learning_rate": 7.966e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35462, "epoch": 1.66666667, "global_step/max_steps": "330/990", "percentage": "33.33%", "elapsed_time": "15m 30s", "remaining_time": "31m 0s"} +{"loss": 0.3810936, "token_acc": 0.871084, "grad_norm": 0.31084558, "learning_rate": 7.898e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354627, "epoch": 1.69191919, "global_step/max_steps": "335/990", "percentage": "33.84%", "elapsed_time": "15m 44s", "remaining_time": "30m 46s"} +{"loss": 0.26326938, "token_acc": 0.90641361, "grad_norm": 0.78373098, "learning_rate": 7.83e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355662, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "15m 55s", "remaining_time": "30m 26s"} +{"eval_loss": 0.36019456, "eval_token_acc": 0.7726378, "eval_runtime": 1.2553, "eval_samples_per_second": 3.186, "eval_steps_per_second": 3.186, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "15m 56s", "remaining_time": "30m 
29s"} +{"loss": 0.28197329, "token_acc": 0.89460291, "grad_norm": 0.32694358, "learning_rate": 7.76e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35432, "epoch": 1.74242424, "global_step/max_steps": "345/990", "percentage": "34.85%", "elapsed_time": "16m 13s", "remaining_time": "30m 19s"} +{"loss": 0.16738427, "token_acc": 0.92150943, "grad_norm": 1.02787757, "learning_rate": 7.69e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355229, "epoch": 1.76767677, "global_step/max_steps": "350/990", "percentage": "35.35%", "elapsed_time": "16m 24s", "remaining_time": "30m 0s"} +{"loss": 0.47132893, "token_acc": 0.84775465, "grad_norm": 0.61679125, "learning_rate": 7.62e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355466, "epoch": 1.79292929, "global_step/max_steps": "355/990", "percentage": "35.86%", "elapsed_time": "16m 38s", "remaining_time": "29m 45s"} +{"loss": 0.25980012, "token_acc": 0.89681096, "grad_norm": 0.26643774, "learning_rate": 7.548e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355753, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "16m 51s", "remaining_time": "29m 30s"} +{"eval_loss": 0.37819037, "eval_token_acc": 0.77854331, "eval_runtime": 1.3003, "eval_samples_per_second": 3.076, "eval_steps_per_second": 3.076, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "16m 52s", "remaining_time": "29m 32s"} +{"loss": 0.38479092, "token_acc": 0.85471509, "grad_norm": 0.70963585, "learning_rate": 7.476e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355576, "epoch": 1.84343434, "global_step/max_steps": "365/990", "percentage": "36.87%", "elapsed_time": "17m 6s", "remaining_time": "29m 17s"} +{"loss": 0.3501107, "token_acc": 0.87635735, "grad_norm": 0.28540483, "learning_rate": 7.403e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355573, "epoch": 1.86868687, "global_step/max_steps": "370/990", "percentage": "37.37%", "elapsed_time": "17m 
20s", "remaining_time": "29m 3s"} +{"loss": 0.40884962, "token_acc": 0.86528253, "grad_norm": 0.43131444, "learning_rate": 7.329e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355232, "epoch": 1.89393939, "global_step/max_steps": "375/990", "percentage": "37.88%", "elapsed_time": "17m 35s", "remaining_time": "28m 50s"} +{"loss": 0.35481386, "token_acc": 0.88798791, "grad_norm": 0.60752594, "learning_rate": 7.255e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35565, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "17m 48s", "remaining_time": "28m 34s"} +{"eval_loss": 0.37313604, "eval_token_acc": 0.77066929, "eval_runtime": 1.2667, "eval_samples_per_second": 3.158, "eval_steps_per_second": 3.158, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "17m 49s", "remaining_time": "28m 36s"} +{"loss": 0.37571774, "token_acc": 0.88282878, "grad_norm": 0.16390289, "learning_rate": 7.18e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354333, "epoch": 1.94444444, "global_step/max_steps": "385/990", "percentage": "38.89%", "elapsed_time": "18m 6s", "remaining_time": "28m 26s"} +{"loss": 0.25887558, "token_acc": 0.89801081, "grad_norm": 0.3281329, "learning_rate": 7.105e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353935, "epoch": 1.96969697, "global_step/max_steps": "390/990", "percentage": "39.39%", "elapsed_time": "18m 21s", "remaining_time": "28m 14s"} +{"loss": 0.25436189, "token_acc": 0.90065183, "grad_norm": 0.31314793, "learning_rate": 7.029e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354813, "epoch": 1.99494949, "global_step/max_steps": "395/990", "percentage": "39.90%", "elapsed_time": "18m 32s", "remaining_time": "27m 56s"} +{"loss": 0.22614222, "token_acc": 0.94006903, "grad_norm": 0.58028269, "learning_rate": 6.952e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356043, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": 
"40.40%", "elapsed_time": "18m 43s", "remaining_time": "27m 36s"} +{"eval_loss": 0.35362297, "eval_token_acc": 0.77066929, "eval_runtime": 1.2885, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "18m 44s", "remaining_time": "27m 38s"} +{"loss": 0.15614408, "token_acc": 0.9098952, "grad_norm": 0.61919498, "learning_rate": 6.875e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355365, "epoch": 2.04545455, "global_step/max_steps": "405/990", "percentage": "40.91%", "elapsed_time": "18m 59s", "remaining_time": "27m 25s"} +{"loss": 0.27334342, "token_acc": 0.91889667, "grad_norm": 0.77250832, "learning_rate": 6.797e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355292, "epoch": 2.07070707, "global_step/max_steps": "410/990", "percentage": "41.41%", "elapsed_time": "19m 13s", "remaining_time": "27m 11s"} +{"loss": 0.19493244, "token_acc": 0.92269807, "grad_norm": 0.37131822, "learning_rate": 6.719e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355986, "epoch": 2.0959596, "global_step/max_steps": "415/990", "percentage": "41.92%", "elapsed_time": "19m 25s", "remaining_time": "26m 54s"} +{"loss": 0.09827778, "token_acc": 0.97174134, "grad_norm": 0.28456211, "learning_rate": 6.64e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356829, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "19m 36s", "remaining_time": "26m 36s"} +{"eval_loss": 0.35825771, "eval_token_acc": 0.76673228, "eval_runtime": 1.2794, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "19m 37s", "remaining_time": "26m 38s"} +{"loss": 0.19055347, "token_acc": 0.89196273, "grad_norm": 0.60775864, "learning_rate": 6.561e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356579, "epoch": 2.14646465, "global_step/max_steps": 
"425/990", "percentage": "42.93%", "elapsed_time": "19m 51s", "remaining_time": "26m 23s"} +{"loss": 0.1357766, "token_acc": 0.95816373, "grad_norm": 0.49307492, "learning_rate": 6.481e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356034, "epoch": 2.17171717, "global_step/max_steps": "430/990", "percentage": "43.43%", "elapsed_time": "20m 7s", "remaining_time": "26m 12s"} +{"loss": 0.18788767, "token_acc": 0.9421219, "grad_norm": 0.26895335, "learning_rate": 6.401e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355275, "epoch": 2.1969697, "global_step/max_steps": "435/990", "percentage": "43.94%", "elapsed_time": "20m 24s", "remaining_time": "26m 1s"} +{"loss": 0.21257937, "token_acc": 0.9272428, "grad_norm": 0.99798816, "learning_rate": 6.321e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355873, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "20m 36s", "remaining_time": "25m 45s"} +{"eval_loss": 0.3688502, "eval_token_acc": 0.77066929, "eval_runtime": 1.2802, "eval_samples_per_second": 3.124, "eval_steps_per_second": 3.124, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "20m 37s", "remaining_time": "25m 46s"} +{"loss": 0.11968926, "token_acc": 0.93040943, "grad_norm": 0.37610239, "learning_rate": 6.24e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355188, "epoch": 2.24747475, "global_step/max_steps": "445/990", "percentage": "44.95%", "elapsed_time": "20m 52s", "remaining_time": "25m 33s"} +{"loss": 0.26689649, "token_acc": 0.90589548, "grad_norm": 0.71591049, "learning_rate": 6.159e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355771, "epoch": 2.27272727, "global_step/max_steps": "450/990", "percentage": "45.45%", "elapsed_time": "21m 4s", "remaining_time": "25m 17s"} +{"loss": 0.18547212, "token_acc": 0.93247423, "grad_norm": 0.29641232, "learning_rate": 6.078e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355991, "epoch": 
2.2979798, "global_step/max_steps": "455/990", "percentage": "45.96%", "elapsed_time": "21m 17s", "remaining_time": "25m 2s"} +{"loss": 0.19393576, "token_acc": 0.93776824, "grad_norm": 0.25186142, "learning_rate": 5.996e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356327, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "21m 30s", "remaining_time": "24m 46s"} +{"eval_loss": 0.36333546, "eval_token_acc": 0.77066929, "eval_runtime": 1.2615, "eval_samples_per_second": 3.171, "eval_steps_per_second": 3.171, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "21m 31s", "remaining_time": "24m 48s"} +{"loss": 0.17664583, "token_acc": 0.89212513, "grad_norm": 0.49181655, "learning_rate": 5.914e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356557, "epoch": 2.34848485, "global_step/max_steps": "465/990", "percentage": "46.97%", "elapsed_time": "21m 43s", "remaining_time": "24m 31s"} +{"loss": 0.13740575, "token_acc": 0.9428539, "grad_norm": 0.56454843, "learning_rate": 5.832e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356147, "epoch": 2.37373737, "global_step/max_steps": "470/990", "percentage": "47.47%", "elapsed_time": "21m 59s", "remaining_time": "24m 19s"} +{"loss": 0.21261225, "token_acc": 0.92061974, "grad_norm": 1.33456028, "learning_rate": 5.749e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356639, "epoch": 2.3989899, "global_step/max_steps": "475/990", "percentage": "47.98%", "elapsed_time": "22m 11s", "remaining_time": "24m 3s"} +{"loss": 0.20756485, "token_acc": 0.92529072, "grad_norm": 0.32525605, "learning_rate": 5.666e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355528, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "22m 29s", "remaining_time": "23m 54s"} +{"eval_loss": 0.36280358, "eval_token_acc": 0.76673228, "eval_runtime": 1.2645, "eval_samples_per_second": 3.163, 
"eval_steps_per_second": 3.163, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "22m 30s", "remaining_time": "23m 55s"} +{"loss": 0.13927946, "token_acc": 0.89843927, "grad_norm": 0.62448323, "learning_rate": 5.584e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355772, "epoch": 2.44949495, "global_step/max_steps": "485/990", "percentage": "48.99%", "elapsed_time": "22m 42s", "remaining_time": "23m 39s"} +{"loss": 0.28371801, "token_acc": 0.89282836, "grad_norm": 0.54572952, "learning_rate": 5.5e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355994, "epoch": 2.47474747, "global_step/max_steps": "490/990", "percentage": "49.49%", "elapsed_time": "22m 56s", "remaining_time": "23m 24s"} +{"loss": 0.16324831, "token_acc": 0.93821839, "grad_norm": 0.26068228, "learning_rate": 5.417e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356737, "epoch": 2.5, "global_step/max_steps": "495/990", "percentage": "50.00%", "elapsed_time": "23m 7s", "remaining_time": "23m 7s"} +{"loss": 0.23240807, "token_acc": 0.89189855, "grad_norm": 0.47255304, "learning_rate": 5.334e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356548, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "23m 21s", "remaining_time": "22m 53s"} +{"eval_loss": 0.3651616, "eval_token_acc": 0.77066929, "eval_runtime": 1.2784, "eval_samples_per_second": 3.129, "eval_steps_per_second": 3.129, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "23m 23s", "remaining_time": "22m 55s"} +{"loss": 0.1586161, "token_acc": 0.90600259, "grad_norm": 0.59906375, "learning_rate": 5.251e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356161, "epoch": 2.55050505, "global_step/max_steps": "505/990", "percentage": "51.01%", "elapsed_time": "23m 37s", "remaining_time": "22m 41s"} +{"loss": 0.16548033, "token_acc": 0.93703542, "grad_norm": 0.70702755, "learning_rate": 5.167e-05, 
"memory(GiB)": 129.1, "train_speed(iter/s)": 0.357031, "epoch": 2.57575758, "global_step/max_steps": "510/990", "percentage": "51.52%", "elapsed_time": "23m 48s", "remaining_time": "22m 24s"} +{"loss": 0.18845116, "token_acc": 0.92710425, "grad_norm": 0.30997011, "learning_rate": 5.084e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357488, "epoch": 2.6010101, "global_step/max_steps": "515/990", "percentage": "52.02%", "elapsed_time": "24m 0s", "remaining_time": "22m 8s"} +{"loss": 0.19470885, "token_acc": 0.91886224, "grad_norm": 0.34516478, "learning_rate": 5e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357567, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "24m 13s", "remaining_time": "21m 54s"} +{"eval_loss": 0.3530367, "eval_token_acc": 0.76574803, "eval_runtime": 1.2716, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "24m 15s", "remaining_time": "21m 55s"} +{"loss": 0.16326183, "token_acc": 0.91700155, "grad_norm": 0.40084693, "learning_rate": 4.916e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357449, "epoch": 2.65151515, "global_step/max_steps": "525/990", "percentage": "53.03%", "elapsed_time": "24m 28s", "remaining_time": "21m 40s"} +{"loss": 0.22751577, "token_acc": 0.91101314, "grad_norm": 0.35263517, "learning_rate": 4.833e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356693, "epoch": 2.67676768, "global_step/max_steps": "530/990", "percentage": "53.54%", "elapsed_time": "24m 45s", "remaining_time": "21m 29s"} +{"loss": 0.16660224, "token_acc": 0.9465856, "grad_norm": 0.77503324, "learning_rate": 4.749e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356093, "epoch": 2.7020202, "global_step/max_steps": "535/990", "percentage": "54.04%", "elapsed_time": "25m 2s", "remaining_time": "21m 17s"} +{"loss": 0.29327481, "token_acc": 0.89345698, "grad_norm": 1.00823629, 
"learning_rate": 4.666e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355675, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "25m 17s", "remaining_time": "21m 4s"} +{"eval_loss": 0.34143466, "eval_token_acc": 0.76476378, "eval_runtime": 1.2634, "eval_samples_per_second": 3.166, "eval_steps_per_second": 3.166, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "25m 19s", "remaining_time": "21m 5s"} +{"loss": 0.22186093, "token_acc": 0.88828243, "grad_norm": 0.506253, "learning_rate": 4.583e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355094, "epoch": 2.75252525, "global_step/max_steps": "545/990", "percentage": "55.05%", "elapsed_time": "25m 34s", "remaining_time": "20m 52s"} +{"loss": 0.17150269, "token_acc": 0.93117289, "grad_norm": 0.4163118, "learning_rate": 4.5e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355204, "epoch": 2.77777778, "global_step/max_steps": "550/990", "percentage": "55.56%", "elapsed_time": "25m 48s", "remaining_time": "20m 38s"} +{"loss": 0.18130356, "token_acc": 0.93726717, "grad_norm": 0.68722183, "learning_rate": 4.416e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35566, "epoch": 2.8030303, "global_step/max_steps": "555/990", "percentage": "56.06%", "elapsed_time": "26m 0s", "remaining_time": "20m 22s"} +{"loss": 0.15754029, "token_acc": 0.92938057, "grad_norm": 0.25893658, "learning_rate": 4.334e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355647, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "26m 14s", "remaining_time": "20m 8s"} +{"eval_loss": 0.32865202, "eval_token_acc": 0.76476378, "eval_runtime": 1.2642, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "26m 15s", "remaining_time": "20m 9s"} +{"loss": 0.20555024, "token_acc": 0.90319499, 
"grad_norm": 0.68745732, "learning_rate": 4.251e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355089, "epoch": 2.85353535, "global_step/max_steps": "565/990", "percentage": "57.07%", "elapsed_time": "26m 30s", "remaining_time": "19m 56s"} +{"loss": 0.25478287, "token_acc": 0.90388736, "grad_norm": 0.60831439, "learning_rate": 4.168e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354778, "epoch": 2.87878788, "global_step/max_steps": "570/990", "percentage": "57.58%", "elapsed_time": "26m 46s", "remaining_time": "19m 43s"} +{"loss": 0.1764259, "token_acc": 0.93314982, "grad_norm": 0.9529441, "learning_rate": 4.086e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354847, "epoch": 2.9040404, "global_step/max_steps": "575/990", "percentage": "58.08%", "elapsed_time": "27m 0s", "remaining_time": "19m 29s"} +{"loss": 0.24578819, "token_acc": 0.89578802, "grad_norm": 0.17912032, "learning_rate": 4.004e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355458, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "27m 11s", "remaining_time": "19m 13s"} +{"eval_loss": 0.32831043, "eval_token_acc": 0.77066929, "eval_runtime": 1.2645, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "27m 12s", "remaining_time": "19m 14s"} +{"loss": 0.2130367, "token_acc": 0.89207807, "grad_norm": 0.76581943, "learning_rate": 3.922e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355426, "epoch": 2.95454545, "global_step/max_steps": "585/990", "percentage": "59.09%", "elapsed_time": "27m 25s", "remaining_time": "18m 59s"} +{"loss": 0.19972539, "token_acc": 0.93411017, "grad_norm": 0.70901978, "learning_rate": 3.841e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354892, "epoch": 2.97979798, "global_step/max_steps": "590/990", "percentage": "59.60%", "elapsed_time": "27m 42s", "remaining_time": "18m 46s"} +{"loss": 
0.22592678, "token_acc": 0.93356998, "grad_norm": 0.26023608, "learning_rate": 3.76e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354377, "epoch": 3.00505051, "global_step/max_steps": "595/990", "percentage": "60.10%", "elapsed_time": "27m 58s", "remaining_time": "18m 34s"} +{"loss": 0.13799512, "token_acc": 0.95704395, "grad_norm": 0.8862583, "learning_rate": 3.679e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354572, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "28m 11s", "remaining_time": "18m 19s"} +{"eval_loss": 0.30323198, "eval_token_acc": 0.77066929, "eval_runtime": 1.2806, "eval_samples_per_second": 3.123, "eval_steps_per_second": 3.123, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "28m 13s", "remaining_time": "18m 20s"} +{"loss": 0.08972094, "token_acc": 0.93971154, "grad_norm": 0.59180546, "learning_rate": 3.599e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.3534, "epoch": 3.05555556, "global_step/max_steps": "605/990", "percentage": "61.11%", "elapsed_time": "28m 31s", "remaining_time": "18m 9s"} +{"loss": 0.04932542, "token_acc": 0.97585312, "grad_norm": 0.70785379, "learning_rate": 3.519e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354057, "epoch": 3.08080808, "global_step/max_steps": "610/990", "percentage": "61.62%", "elapsed_time": "28m 42s", "remaining_time": "17m 53s"} +{"loss": 0.03949524, "token_acc": 0.98925934, "grad_norm": 0.13521677, "learning_rate": 3.439e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354635, "epoch": 3.10606061, "global_step/max_steps": "615/990", "percentage": "62.12%", "elapsed_time": "28m 53s", "remaining_time": "17m 37s"} +{"loss": 0.08747857, "token_acc": 0.96919326, "grad_norm": 0.43032891, "learning_rate": 3.36e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354612, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "29m 8s", 
"remaining_time": "17m 23s"} +{"eval_loss": 0.30622691, "eval_token_acc": 0.76968504, "eval_runtime": 1.3165, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "29m 9s", "remaining_time": "17m 23s"} +{"loss": 0.09797982, "token_acc": 0.94929966, "grad_norm": 0.37648767, "learning_rate": 3.281e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353631, "epoch": 3.15656566, "global_step/max_steps": "625/990", "percentage": "63.13%", "elapsed_time": "29m 26s", "remaining_time": "17m 11s"} +{"loss": 0.07529097, "token_acc": 0.96984094, "grad_norm": 0.74529874, "learning_rate": 3.203e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354143, "epoch": 3.18181818, "global_step/max_steps": "630/990", "percentage": "63.64%", "elapsed_time": "29m 38s", "remaining_time": "16m 56s"} +{"loss": 0.13252516, "token_acc": 0.94345073, "grad_norm": 0.5807966, "learning_rate": 3.125e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354003, "epoch": 3.20707071, "global_step/max_steps": "635/990", "percentage": "64.14%", "elapsed_time": "29m 53s", "remaining_time": "16m 42s"} +{"loss": 0.11141908, "token_acc": 0.95033393, "grad_norm": 0.45170957, "learning_rate": 3.048e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353366, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "30m 10s", "remaining_time": "16m 30s"} +{"eval_loss": 0.30879176, "eval_token_acc": 0.76574803, "eval_runtime": 1.3167, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "30m 12s", "remaining_time": "16m 30s"} +{"loss": 0.08485085, "token_acc": 0.94528481, "grad_norm": 0.42097673, "learning_rate": 2.971e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352332, "epoch": 3.25757576, "global_step/max_steps": "645/990", "percentage": "65.15%", 
"elapsed_time": "30m 30s", "remaining_time": "16m 18s"} +{"loss": 0.12707052, "token_acc": 0.95386429, "grad_norm": 0.32493889, "learning_rate": 2.895e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352802, "epoch": 3.28282828, "global_step/max_steps": "650/990", "percentage": "65.66%", "elapsed_time": "30m 42s", "remaining_time": "16m 3s"} +{"loss": 0.0973598, "token_acc": 0.96823225, "grad_norm": 0.4984124, "learning_rate": 2.82e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352963, "epoch": 3.30808081, "global_step/max_steps": "655/990", "percentage": "66.16%", "elapsed_time": "30m 55s", "remaining_time": "15m 48s"} +{"loss": 0.09166446, "token_acc": 0.96317016, "grad_norm": 0.20049615, "learning_rate": 2.745e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353022, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "31m 9s", "remaining_time": "15m 34s"} +{"eval_loss": 0.31010589, "eval_token_acc": 0.76279528, "eval_runtime": 1.2782, "eval_samples_per_second": 3.129, "eval_steps_per_second": 3.129, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "31m 10s", "remaining_time": "15m 35s"} +{"loss": 0.09054632, "token_acc": 0.92311999, "grad_norm": 0.8963623, "learning_rate": 2.671e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353076, "epoch": 3.35858586, "global_step/max_steps": "665/990", "percentage": "67.17%", "elapsed_time": "31m 23s", "remaining_time": "15m 20s"} +{"loss": 0.10762099, "token_acc": 0.96045198, "grad_norm": 0.85323519, "learning_rate": 2.597e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353315, "epoch": 3.38383838, "global_step/max_steps": "670/990", "percentage": "67.68%", "elapsed_time": "31m 35s", "remaining_time": "15m 5s"} +{"loss": 0.07633501, "token_acc": 0.9642299, "grad_norm": 0.36199483, "learning_rate": 2.524e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353653, "epoch": 3.40909091, "global_step/max_steps": 
"675/990", "percentage": "68.18%", "elapsed_time": "31m 48s", "remaining_time": "14m 50s"} +{"loss": 0.12270306, "token_acc": 0.9420904, "grad_norm": 0.66805679, "learning_rate": 2.452e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354043, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "32m 0s", "remaining_time": "14m 35s"} +{"eval_loss": 0.31025913, "eval_token_acc": 0.75787402, "eval_runtime": 1.2774, "eval_samples_per_second": 3.131, "eval_steps_per_second": 3.131, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "32m 1s", "remaining_time": "14m 36s"} +{"loss": 0.11645561, "token_acc": 0.91122384, "grad_norm": 0.28704885, "learning_rate": 2.38e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353305, "epoch": 3.45959596, "global_step/max_steps": "685/990", "percentage": "69.19%", "elapsed_time": "32m 18s", "remaining_time": "14m 23s"} +{"loss": 0.13571174, "token_acc": 0.95302315, "grad_norm": 0.6660772, "learning_rate": 2.31e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353078, "epoch": 3.48484848, "global_step/max_steps": "690/990", "percentage": "69.70%", "elapsed_time": "32m 33s", "remaining_time": "14m 9s"} +{"loss": 0.11649401, "token_acc": 0.95939873, "grad_norm": 0.3928045, "learning_rate": 2.24e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353484, "epoch": 3.51010101, "global_step/max_steps": "695/990", "percentage": "70.20%", "elapsed_time": "32m 45s", "remaining_time": "13m 54s"} +{"loss": 0.03888607, "token_acc": 0.98571429, "grad_norm": 0.24035466, "learning_rate": 2.17e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354064, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "32m 56s", "remaining_time": "13m 38s"} +{"eval_loss": 0.31438074, "eval_token_acc": 0.75885827, "eval_runtime": 1.2708, "eval_samples_per_second": 3.148, "eval_steps_per_second": 3.148, "epoch": 3.53535354, 
"global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "32m 57s", "remaining_time": "13m 39s"} +{"loss": 0.06877344, "token_acc": 0.93323631, "grad_norm": 0.38989666, "learning_rate": 2.102e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354088, "epoch": 3.56060606, "global_step/max_steps": "705/990", "percentage": "71.21%", "elapsed_time": "33m 10s", "remaining_time": "13m 24s"} +{"loss": 0.09043558, "token_acc": 0.96463695, "grad_norm": 0.57702106, "learning_rate": 2.034e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353937, "epoch": 3.58585859, "global_step/max_steps": "710/990", "percentage": "71.72%", "elapsed_time": "33m 25s", "remaining_time": "13m 10s"} +{"loss": 0.00810304, "token_acc": 0.99743918, "grad_norm": 0.0157186, "learning_rate": 1.967e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354851, "epoch": 3.61111111, "global_step/max_steps": "715/990", "percentage": "72.22%", "elapsed_time": "33m 34s", "remaining_time": "12m 54s"} +{"loss": 0.0557899, "token_acc": 0.98127045, "grad_norm": 0.61499214, "learning_rate": 1.901e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354676, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "33m 49s", "remaining_time": "12m 41s"} +{"eval_loss": 0.31738603, "eval_token_acc": 0.76181102, "eval_runtime": 1.2704, "eval_samples_per_second": 3.149, "eval_steps_per_second": 3.149, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "33m 50s", "remaining_time": "12m 41s"} +{"loss": 0.1061089, "token_acc": 0.93141176, "grad_norm": 0.55221421, "learning_rate": 1.836e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354276, "epoch": 3.66161616, "global_step/max_steps": "725/990", "percentage": "73.23%", "elapsed_time": "34m 6s", "remaining_time": "12m 27s"} +{"loss": 0.115659, "token_acc": 0.954354, "grad_norm": 0.32033381, "learning_rate": 1.772e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 
0.35397, "epoch": 3.68686869, "global_step/max_steps": "730/990", "percentage": "73.74%", "elapsed_time": "34m 21s", "remaining_time": "12m 14s"} +{"loss": 0.09632692, "token_acc": 0.95803057, "grad_norm": 0.54807562, "learning_rate": 1.709e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353862, "epoch": 3.71212121, "global_step/max_steps": "735/990", "percentage": "74.24%", "elapsed_time": "34m 36s", "remaining_time": "12m 0s"} +{"loss": 0.12017931, "token_acc": 0.96243498, "grad_norm": 0.71036881, "learning_rate": 1.646e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354334, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "34m 48s", "remaining_time": "11m 45s"} +{"eval_loss": 0.31393528, "eval_token_acc": 0.75984252, "eval_runtime": 1.2642, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "34m 49s", "remaining_time": "11m 45s"} +{"loss": 0.06079275, "token_acc": 0.91029095, "grad_norm": 0.36095589, "learning_rate": 1.585e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354549, "epoch": 3.76262626, "global_step/max_steps": "745/990", "percentage": "75.25%", "elapsed_time": "35m 0s", "remaining_time": "11m 30s"} +{"loss": 0.03985467, "token_acc": 0.98336897, "grad_norm": 0.72005308, "learning_rate": 1.524e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355155, "epoch": 3.78787879, "global_step/max_steps": "750/990", "percentage": "75.76%", "elapsed_time": "35m 11s", "remaining_time": "11m 15s"} +{"loss": 0.1864677, "token_acc": 0.92220249, "grad_norm": 0.47436634, "learning_rate": 1.464e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354059, "epoch": 3.81313131, "global_step/max_steps": "755/990", "percentage": "76.26%", "elapsed_time": "35m 32s", "remaining_time": "11m 3s"} +{"loss": 0.01775267, "token_acc": 0.99469777, "grad_norm": 0.41593018, "learning_rate": 1.406e-05, 
"memory(GiB)": 129.1, "train_speed(iter/s)": 0.354791, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "35m 41s", "remaining_time": "10m 48s"} +{"eval_loss": 0.31617671, "eval_token_acc": 0.75492126, "eval_runtime": 1.2713, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "35m 42s", "remaining_time": "10m 48s"} +{"loss": 0.11076927, "token_acc": 0.92013995, "grad_norm": 0.63009536, "learning_rate": 1.348e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354628, "epoch": 3.86363636, "global_step/max_steps": "765/990", "percentage": "77.27%", "elapsed_time": "35m 56s", "remaining_time": "10m 34s"} +{"loss": 0.06400648, "token_acc": 0.9771947, "grad_norm": 0.17331809, "learning_rate": 1.292e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354776, "epoch": 3.88888889, "global_step/max_steps": "770/990", "percentage": "77.78%", "elapsed_time": "36m 9s", "remaining_time": "10m 19s"} +{"loss": 0.09050332, "token_acc": 0.97415747, "grad_norm": 0.5865472, "learning_rate": 1.236e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354814, "epoch": 3.91414141, "global_step/max_steps": "775/990", "percentage": "78.28%", "elapsed_time": "36m 23s", "remaining_time": "10m 5s"} +{"loss": 0.09545594, "token_acc": 0.9562638, "grad_norm": 0.40246558, "learning_rate": 1.182e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354768, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "36m 38s", "remaining_time": "9m 51s"} +{"eval_loss": 0.32290545, "eval_token_acc": 0.75885827, "eval_runtime": 1.2807, "eval_samples_per_second": 3.123, "eval_steps_per_second": 3.123, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "36m 39s", "remaining_time": "9m 52s"} +{"loss": 0.08384126, "token_acc": 0.91694772, "grad_norm": 2.19844842, 
"learning_rate": 1.128e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354661, "epoch": 3.96464646, "global_step/max_steps": "785/990", "percentage": "79.29%", "elapsed_time": "36m 52s", "remaining_time": "9m 37s"} +{"loss": 0.09766409, "token_acc": 0.96777392, "grad_norm": 0.83991456, "learning_rate": 1.076e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355019, "epoch": 3.98989899, "global_step/max_steps": "790/990", "percentage": "79.80%", "elapsed_time": "37m 4s", "remaining_time": "9m 23s"} +{"loss": 0.08901908, "token_acc": 0.96995775, "grad_norm": 0.51676941, "learning_rate": 1.025e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355271, "epoch": 4.01515152, "global_step/max_steps": "795/990", "percentage": "80.30%", "elapsed_time": "37m 17s", "remaining_time": "9m 8s"} +{"loss": 0.03536704, "token_acc": 0.98747168, "grad_norm": 0.29580003, "learning_rate": 9.75e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355303, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "37m 31s", "remaining_time": "8m 54s"} +{"eval_loss": 0.32690543, "eval_token_acc": 0.75885827, "eval_runtime": 1.2877, "eval_samples_per_second": 3.106, "eval_steps_per_second": 3.106, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "37m 32s", "remaining_time": "8m 54s"} +{"loss": 0.0658075, "token_acc": 0.94917565, "grad_norm": 0.67833024, "learning_rate": 9.26e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354851, "epoch": 4.06565657, "global_step/max_steps": "805/990", "percentage": "81.31%", "elapsed_time": "37m 48s", "remaining_time": "8m 41s"} +{"loss": 0.01897778, "token_acc": 0.99392444, "grad_norm": 0.02713463, "learning_rate": 8.78e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355276, "epoch": 4.09090909, "global_step/max_steps": "810/990", "percentage": "81.82%", "elapsed_time": "37m 59s", "remaining_time": "8m 26s"} +{"loss": 0.04440165, "token_acc": 0.98306101, 
"grad_norm": 0.47375405, "learning_rate": 8.31e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35558, "epoch": 4.11616162, "global_step/max_steps": "815/990", "percentage": "82.32%", "elapsed_time": "38m 11s", "remaining_time": "8m 12s"} +{"loss": 0.03465362, "token_acc": 0.98840838, "grad_norm": 0.26994729, "learning_rate": 7.86e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355478, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "38m 26s", "remaining_time": "7m 58s"} +{"eval_loss": 0.33224982, "eval_token_acc": 0.75787402, "eval_runtime": 1.266, "eval_samples_per_second": 3.16, "eval_steps_per_second": 3.16, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "38m 27s", "remaining_time": "7m 58s"} +{"loss": 0.06061631, "token_acc": 0.95628474, "grad_norm": 0.46439409, "learning_rate": 7.41e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354922, "epoch": 4.16666667, "global_step/max_steps": "825/990", "percentage": "83.33%", "elapsed_time": "38m 44s", "remaining_time": "7m 44s"} +{"loss": 0.07455227, "token_acc": 0.94440433, "grad_norm": 0.32245392, "learning_rate": 6.98e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354969, "epoch": 4.19191919, "global_step/max_steps": "830/990", "percentage": "83.84%", "elapsed_time": "38m 57s", "remaining_time": "7m 30s"} +{"loss": 0.0996636, "token_acc": 0.95652174, "grad_norm": 0.65794915, "learning_rate": 6.56e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355055, "epoch": 4.21717172, "global_step/max_steps": "835/990", "percentage": "84.34%", "elapsed_time": "39m 11s", "remaining_time": "7m 16s"} +{"loss": 0.07703586, "token_acc": 0.96763858, "grad_norm": 0.44027379, "learning_rate": 6.15e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354947, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "39m 26s", "remaining_time": "7m 2s"} +{"eval_loss": 0.33525062, 
"eval_token_acc": 0.75688976, "eval_runtime": 1.2662, "eval_samples_per_second": 3.159, "eval_steps_per_second": 3.159, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "39m 27s", "remaining_time": "7m 2s"} +{"loss": 0.14292825, "token_acc": 0.92279307, "grad_norm": 0.83593249, "learning_rate": 5.76e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354142, "epoch": 4.26767677, "global_step/max_steps": "845/990", "percentage": "85.35%", "elapsed_time": "39m 45s", "remaining_time": "6m 49s"} +{"loss": 0.10208558, "token_acc": 0.96112682, "grad_norm": 0.2799941, "learning_rate": 5.37e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353931, "epoch": 4.29292929, "global_step/max_steps": "850/990", "percentage": "85.86%", "elapsed_time": "40m 1s", "remaining_time": "6m 35s"} +{"loss": 0.03595015, "token_acc": 0.98651441, "grad_norm": 0.39005765, "learning_rate": 5e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353884, "epoch": 4.31818182, "global_step/max_steps": "855/990", "percentage": "86.36%", "elapsed_time": "40m 15s", "remaining_time": "6m 21s"} +{"loss": 0.03491878, "token_acc": 0.98842007, "grad_norm": 0.2543757, "learning_rate": 4.65e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353895, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "40m 29s", "remaining_time": "6m 7s"} +{"eval_loss": 0.33576337, "eval_token_acc": 0.75393701, "eval_runtime": 1.2612, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "40m 30s", "remaining_time": "6m 7s"} +{"loss": 0.04228464, "token_acc": 0.95443105, "grad_norm": 0.50258362, "learning_rate": 4.3e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353642, "epoch": 4.36868687, "global_step/max_steps": "865/990", "percentage": "87.37%", "elapsed_time": "40m 45s", "remaining_time": "5m 53s"} +{"loss": 0.0381623, 
"token_acc": 0.98923369, "grad_norm": 0.61790901, "learning_rate": 3.97e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35391, "epoch": 4.39393939, "global_step/max_steps": "870/990", "percentage": "87.88%", "elapsed_time": "40m 57s", "remaining_time": "5m 39s"} +{"loss": 0.02394222, "token_acc": 0.98653773, "grad_norm": 0.42816147, "learning_rate": 3.65e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354293, "epoch": 4.41919192, "global_step/max_steps": "875/990", "percentage": "88.38%", "elapsed_time": "41m 9s", "remaining_time": "5m 24s"} +{"loss": 0.05464286, "token_acc": 0.97360966, "grad_norm": 0.40714192, "learning_rate": 3.34e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354416, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "41m 22s", "remaining_time": "5m 10s"} +{"eval_loss": 0.33596614, "eval_token_acc": 0.75590551, "eval_runtime": 1.3133, "eval_samples_per_second": 3.046, "eval_steps_per_second": 3.046, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "41m 23s", "remaining_time": "5m 10s"} +{"loss": 0.03318539, "token_acc": 0.95981472, "grad_norm": 0.34820908, "learning_rate": 3.05e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35415, "epoch": 4.46969697, "global_step/max_steps": "885/990", "percentage": "89.39%", "elapsed_time": "41m 38s", "remaining_time": "4m 56s"} +{"loss": 0.0540942, "token_acc": 0.98252743, "grad_norm": 0.4226729, "learning_rate": 2.77e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354092, "epoch": 4.49494949, "global_step/max_steps": "890/990", "percentage": "89.90%", "elapsed_time": "41m 53s", "remaining_time": "4m 42s"} +{"loss": 0.02493553, "token_acc": 0.98607413, "grad_norm": 0.4588812, "learning_rate": 2.5e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354453, "epoch": 4.52020202, "global_step/max_steps": "895/990", "percentage": "90.40%", "elapsed_time": "42m 4s", "remaining_time": "4m 27s"} 
+{"loss": 0.11395398, "token_acc": 0.9591114, "grad_norm": 0.61124039, "learning_rate": 2.24e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354293, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "42m 19s", "remaining_time": "4m 13s"} +{"eval_loss": 0.33693221, "eval_token_acc": 0.75492126, "eval_runtime": 1.3215, "eval_samples_per_second": 3.027, "eval_steps_per_second": 3.027, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "42m 21s", "remaining_time": "4m 14s"} +{"loss": 0.00986408, "token_acc": 0.95899348, "grad_norm": 0.22429115, "learning_rate": 2e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354216, "epoch": 4.57070707, "global_step/max_steps": "905/990", "percentage": "91.41%", "elapsed_time": "42m 34s", "remaining_time": "3m 59s"} +{"loss": 0.01508615, "token_acc": 0.99517154, "grad_norm": 0.14168987, "learning_rate": 1.78e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354789, "epoch": 4.5959596, "global_step/max_steps": "910/990", "percentage": "91.92%", "elapsed_time": "42m 44s", "remaining_time": "3m 45s"} +{"loss": 0.07223158, "token_acc": 0.97341638, "grad_norm": 0.02749376, "learning_rate": 1.56e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354981, "epoch": 4.62121212, "global_step/max_steps": "915/990", "percentage": "92.42%", "elapsed_time": "42m 57s", "remaining_time": "3m 31s"} +{"loss": 0.02913244, "token_acc": 0.98984772, "grad_norm": 0.22186062, "learning_rate": 1.36e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355396, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "43m 8s", "remaining_time": "3m 16s"} +{"eval_loss": 0.33511752, "eval_token_acc": 0.75492126, "eval_runtime": 1.2779, "eval_samples_per_second": 3.13, "eval_steps_per_second": 3.13, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "43m 9s", "remaining_time": 
"3m 17s"} +{"loss": 0.01448483, "token_acc": 0.95506692, "grad_norm": 0.53574461, "learning_rate": 1.18e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355269, "epoch": 4.67171717, "global_step/max_steps": "925/990", "percentage": "93.43%", "elapsed_time": "43m 23s", "remaining_time": "3m 2s"} +{"loss": 0.0990447, "token_acc": 0.95625972, "grad_norm": 0.03411055, "learning_rate": 1e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355569, "epoch": 4.6969697, "global_step/max_steps": "930/990", "percentage": "93.94%", "elapsed_time": "43m 35s", "remaining_time": "2m 48s"} +{"loss": 0.09365716, "token_acc": 0.96371316, "grad_norm": 0.46665722, "learning_rate": 8.4e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355182, "epoch": 4.72222222, "global_step/max_steps": "935/990", "percentage": "94.44%", "elapsed_time": "43m 52s", "remaining_time": "2m 34s"} +{"loss": 0.04413899, "token_acc": 0.97450948, "grad_norm": 0.08903439, "learning_rate": 7e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355403, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "44m 4s", "remaining_time": "2m 20s"} +{"eval_loss": 0.33372214, "eval_token_acc": 0.75492126, "eval_runtime": 1.3056, "eval_samples_per_second": 3.064, "eval_steps_per_second": 3.064, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "44m 5s", "remaining_time": "2m 20s"} +{"loss": 0.03972515, "token_acc": 0.96120764, "grad_norm": 0.2658371, "learning_rate": 5.6e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354412, "epoch": 4.77272727, "global_step/max_steps": "945/990", "percentage": "95.45%", "elapsed_time": "44m 26s", "remaining_time": "2m 6s"} +{"loss": 0.0382106, "token_acc": 0.975949, "grad_norm": 0.03347943, "learning_rate": 4.5e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354365, "epoch": 4.7979798, "global_step/max_steps": "950/990", "percentage": "95.96%", "elapsed_time": "44m 40s", 
"remaining_time": "1m 52s"} +{"loss": 0.01510194, "token_acc": 0.99578837, "grad_norm": 0.37791628, "learning_rate": 3.4e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354476, "epoch": 4.82323232, "global_step/max_steps": "955/990", "percentage": "96.46%", "elapsed_time": "44m 53s", "remaining_time": "1m 38s"} +{"loss": 0.08025824, "token_acc": 0.96340541, "grad_norm": 0.60390872, "learning_rate": 2.5e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354659, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "45m 6s", "remaining_time": "1m 24s"} +{"eval_loss": 0.33536115, "eval_token_acc": 0.75590551, "eval_runtime": 1.2813, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "45m 7s", "remaining_time": "1m 24s"} +{"loss": 0.04071699, "token_acc": 0.94731232, "grad_norm": 0.24173401, "learning_rate": 1.7e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354442, "epoch": 4.87373737, "global_step/max_steps": "965/990", "percentage": "97.47%", "elapsed_time": "45m 22s", "remaining_time": "1m 10s"} +{"loss": 0.04371882, "token_acc": 0.98628779, "grad_norm": 0.48991632, "learning_rate": 1.1e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35482, "epoch": 4.8989899, "global_step/max_steps": "970/990", "percentage": "97.98%", "elapsed_time": "45m 33s", "remaining_time": "56s"} +{"loss": 0.02706504, "token_acc": 0.99445585, "grad_norm": 0.24170139, "learning_rate": 6e-08, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355295, "epoch": 4.92424242, "global_step/max_steps": "975/990", "percentage": "98.48%", "elapsed_time": "45m 43s", "remaining_time": "42s"} +{"loss": 0.06976212, "token_acc": 0.96614544, "grad_norm": 0.11299926, "learning_rate": 3e-08, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355325, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "45m 
57s", "remaining_time": "28s"} +{"eval_loss": 0.33634764, "eval_token_acc": 0.75492126, "eval_runtime": 1.2882, "eval_samples_per_second": 3.105, "eval_steps_per_second": 3.105, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "45m 58s", "remaining_time": "28s"} +{"loss": 0.13659828, "token_acc": 0.92223267, "grad_norm": 0.33342829, "learning_rate": 1e-08, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354804, "epoch": 4.97474747, "global_step/max_steps": "985/990", "percentage": "99.49%", "elapsed_time": "46m 15s", "remaining_time": "14s"} +{"loss": 0.01673526, "token_acc": 0.99455041, "grad_norm": 0.26404589, "learning_rate": 0.0, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355027, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 28s", "remaining_time": "0s"} +{"eval_loss": 0.3339096, "eval_token_acc": 0.75590551, "eval_runtime": 1.2791, "eval_samples_per_second": 3.127, "eval_steps_per_second": 3.127, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 29s", "remaining_time": "0s"} +{"train_runtime": 2791.2937, "train_samples_per_second": 0.709, "train_steps_per_second": 0.355, "total_flos": 2.95876726119936e+17, "train_loss": 0.23766249, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "46m 31s", "remaining_time": "0s"} +{"train_dataset": "775.398990±644.578527, min=41.000000, max=4149.000000, size=396", "val_dataset": "311.500000±316.897854, min=85.000000, max=854.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 32898.0941M Params (134.2177M Trainable [0.4080%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-990", "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/checkpoint-600", "best_metric": 0.30323198, "global_step": 990, "log_history": [{"loss": 0.5951199531555176, "token_acc": 0.8394495412844036, "grad_norm": 0.15499143302440643, "learning_rate": 2.0000000000000003e-06, "memory(GiB)": 71.81, "train_speed(iter/s)": 0.207991, "epoch": 0.005050505050505051, "step": 1}, {"loss": 0.7110069990158081, "token_acc": 0.8298845043310876, "grad_norm": 0.31547653675079346, "learning_rate": 1e-05, "memory(GiB)": 81.23, "train_speed(iter/s)": 0.34062, "epoch": 0.025252525252525252, "step": 5}, {"loss": 0.7590272426605225, "token_acc": 0.7956749880744156, "grad_norm": 0.18780562281608582, "learning_rate": 2e-05, "memory(GiB)": 87.6, "train_speed(iter/s)": 0.36629, "epoch": 0.050505050505050504, "step": 10}, {"loss": 0.7305656909942627, "token_acc": 0.8000784518828452, "grad_norm": 0.20020845532417297, "learning_rate": 3e-05, "memory(GiB)": 98.05, "train_speed(iter/s)": 0.368677, "epoch": 0.07575757575757576, "step": 15}, {"loss": 1.0381051063537599, "token_acc": 0.8534532791642484, "grad_norm": 3.012465715408325, "learning_rate": 4e-05, "memory(GiB)": 98.05, "train_speed(iter/s)": 0.388448, "epoch": 0.10101010101010101, "step": 20}, {"eval_loss": 1.6039576530456543, "eval_token_acc": 0.71751968503937, "eval_runtime": 1.3108, "eval_samples_per_second": 3.051, "eval_steps_per_second": 3.051, "epoch": 0.10101010101010101, "step": 20}, {"loss": 0.534688138961792, "token_acc": 0.8304161804745235, "grad_norm": 0.24263285100460052, "learning_rate": 5e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.355688, "epoch": 0.12626262626262627, "step": 25}, {"loss": 0.6209209442138672, "token_acc": 0.8155661353756987, "grad_norm": 0.49346253275871277, "learning_rate": 6e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.368676, "epoch": 0.15151515151515152, "step": 30}, {"loss": 0.36601178646087645, "token_acc": 
0.8558015943312666, "grad_norm": 0.15310031175613403, "learning_rate": 7e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.37743, "epoch": 0.17676767676767677, "step": 35}, {"loss": 0.4429020404815674, "token_acc": 0.8457292271934922, "grad_norm": 0.11761368066072464, "learning_rate": 8e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.380325, "epoch": 0.20202020202020202, "step": 40}, {"eval_loss": 0.5790694355964661, "eval_token_acc": 0.7440944881889764, "eval_runtime": 1.3071, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "epoch": 0.20202020202020202, "step": 40}, {"loss": 0.4508810520172119, "token_acc": 0.8364097363083164, "grad_norm": 0.20896033942699432, "learning_rate": 9e-05, "memory(GiB)": 108.04, "train_speed(iter/s)": 0.367432, "epoch": 0.22727272727272727, "step": 45}, {"loss": 0.503812837600708, "token_acc": 0.8802865956811623, "grad_norm": 0.1467944234609604, "learning_rate": 0.0001, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.35784, "epoch": 0.25252525252525254, "step": 50}, {"loss": 0.45382375717163087, "token_acc": 0.8511966701352758, "grad_norm": 0.19582347571849823, "learning_rate": 9.999301905929286e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.358299, "epoch": 0.2777777777777778, "step": 55}, {"loss": 0.38322081565856936, "token_acc": 0.8599060513954131, "grad_norm": 0.22766543924808502, "learning_rate": 9.997207818651274e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.366204, "epoch": 0.30303030303030304, "step": 60}, {"eval_loss": 0.5421442985534668, "eval_token_acc": 0.7519685039370079, "eval_runtime": 1.3042, "eval_samples_per_second": 3.067, "eval_steps_per_second": 3.067, "epoch": 0.30303030303030304, "step": 60}, {"loss": 0.5506976127624512, "token_acc": 0.826218827229836, "grad_norm": 0.287585973739624, "learning_rate": 9.99371832291393e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.359199, "epoch": 0.3282828282828283, "step": 65}, {"loss": 0.41149077415466306, "token_acc": 
0.8737335359675785, "grad_norm": 0.11081422865390778, "learning_rate": 9.988834393115767e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.356847, "epoch": 0.35353535353535354, "step": 70}, {"loss": 0.5796836376190185, "token_acc": 0.8277399056109072, "grad_norm": 0.1686583310365677, "learning_rate": 9.982557393033758e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.363118, "epoch": 0.3787878787878788, "step": 75}, {"loss": 0.6482028961181641, "token_acc": 0.8418734400234914, "grad_norm": 0.1717352271080017, "learning_rate": 9.974889075442521e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.364963, "epoch": 0.40404040404040403, "step": 80}, {"eval_loss": 0.5349594354629517, "eval_token_acc": 0.7490157480314961, "eval_runtime": 1.2882, "eval_samples_per_second": 3.105, "eval_steps_per_second": 3.105, "epoch": 0.40404040404040403, "step": 80}, {"loss": 0.5291558265686035, "token_acc": 0.8198504418762746, "grad_norm": 0.1399306207895279, "learning_rate": 9.965831581624871e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.360238, "epoch": 0.4292929292929293, "step": 85}, {"loss": 0.43833165168762206, "token_acc": 0.8456410256410256, "grad_norm": 0.3386712074279785, "learning_rate": 9.9553874407739e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.36297, "epoch": 0.45454545454545453, "step": 90}, {"loss": 0.3532155990600586, "token_acc": 0.8691472093894369, "grad_norm": 0.12106972932815552, "learning_rate": 9.94355956928673e-05, "memory(GiB)": 114.53, "train_speed(iter/s)": 0.36177, "epoch": 0.4797979797979798, "step": 95}, {"loss": 0.4101123809814453, "token_acc": 0.8671476137624862, "grad_norm": 0.1363830864429474, "learning_rate": 9.930351269950143e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357637, "epoch": 0.5050505050505051, "step": 100}, {"eval_loss": 0.5257760882377625, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.2863, "eval_samples_per_second": 3.11, "eval_steps_per_second": 3.11, "epoch": 0.5050505050505051, "step": 100}, 
{"loss": 0.5475019454956055, "token_acc": 0.8072100313479624, "grad_norm": 0.1821882128715515, "learning_rate": 9.915766231018318e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356489, "epoch": 0.5303030303030303, "step": 105}, {"loss": 0.45719470977783205, "token_acc": 0.8226950354609929, "grad_norm": 0.645659327507019, "learning_rate": 9.899808525182935e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.362336, "epoch": 0.5555555555555556, "step": 110}, {"loss": 0.44896726608276366, "token_acc": 0.8672267372842662, "grad_norm": 0.09764645248651505, "learning_rate": 9.882482608435923e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357594, "epoch": 0.5808080808080808, "step": 115}, {"loss": 0.5043072700500488, "token_acc": 0.8138049619258167, "grad_norm": 0.3066820800304413, "learning_rate": 9.863793318825186e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.357072, "epoch": 0.6060606060606061, "step": 120}, {"eval_loss": 0.519137442111969, "eval_token_acc": 0.7618110236220472, "eval_runtime": 1.2824, "eval_samples_per_second": 3.119, "eval_steps_per_second": 3.119, "epoch": 0.6060606060606061, "step": 120}, {"loss": 0.4765754222869873, "token_acc": 0.8370143478961792, "grad_norm": 0.20023687183856964, "learning_rate": 9.843745875103627e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356444, "epoch": 0.6313131313131313, "step": 125}, {"loss": 0.4863614082336426, "token_acc": 0.8408579215546865, "grad_norm": 0.21041052043437958, "learning_rate": 9.822345875271883e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356648, "epoch": 0.6565656565656566, "step": 130}, {"loss": 0.36075942516326903, "token_acc": 0.8704587642535137, "grad_norm": 0.10671097040176392, "learning_rate": 9.799599295015154e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356657, "epoch": 0.6818181818181818, "step": 135}, {"loss": 0.533957576751709, "token_acc": 0.827848754678023, "grad_norm": 0.21016854047775269, "learning_rate": 9.775512486034563e-05, "memory(GiB)": 
129.08, "train_speed(iter/s)": 0.356881, "epoch": 0.7070707070707071, "step": 140}, {"eval_loss": 0.47979286313056946, "eval_token_acc": 0.7578740157480315, "eval_runtime": 1.306, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "epoch": 0.7070707070707071, "step": 140}, {"loss": 0.36088201999664304, "token_acc": 0.857113273969766, "grad_norm": 0.21023879945278168, "learning_rate": 9.750092174273521e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.353305, "epoch": 0.7323232323232324, "step": 145}, {"loss": 0.3751711130142212, "token_acc": 0.8822255538897218, "grad_norm": 0.4503144919872284, "learning_rate": 9.723345458039594e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354538, "epoch": 0.7575757575757576, "step": 150}, {"loss": 0.41892757415771487, "token_acc": 0.8584016644229593, "grad_norm": 0.23807577788829803, "learning_rate": 9.69527980602239e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354578, "epoch": 0.7828282828282829, "step": 155}, {"loss": 0.339878511428833, "token_acc": 0.8817881788178817, "grad_norm": 0.19749346375465393, "learning_rate": 9.665903055208014e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.356324, "epoch": 0.8080808080808081, "step": 160}, {"eval_loss": 0.46302199363708496, "eval_token_acc": 0.7559055118110236, "eval_runtime": 1.2753, "eval_samples_per_second": 3.137, "eval_steps_per_second": 3.137, "epoch": 0.8080808080808081, "step": 160}, {"loss": 0.4177990436553955, "token_acc": 0.8499143497166952, "grad_norm": 0.1409613937139511, "learning_rate": 9.635223408690688e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.354285, "epoch": 0.8333333333333334, "step": 165}, {"loss": 0.45575361251831054, "token_acc": 0.8524418908331157, "grad_norm": 0.20421652495861053, "learning_rate": 9.603249433382144e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.35386, "epoch": 0.8585858585858586, "step": 170}, {"loss": 0.4119880199432373, "token_acc": 0.8568082970893275, "grad_norm": 0.2770065367221832, 
"learning_rate": 9.569990057619414e-05, "memory(GiB)": 129.08, "train_speed(iter/s)": 0.35561, "epoch": 0.8838383838383839, "step": 175}, {"loss": 0.41430139541625977, "token_acc": 0.8622650450165851, "grad_norm": 0.19533886015415192, "learning_rate": 9.535454568671704e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356495, "epoch": 0.9090909090909091, "step": 180}, {"eval_loss": 0.47494348883628845, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.271, "eval_samples_per_second": 3.147, "eval_steps_per_second": 3.147, "epoch": 0.9090909090909091, "step": 180}, {"loss": 0.49581570625305177, "token_acc": 0.817189460476788, "grad_norm": 0.258792519569397, "learning_rate": 9.49965261014704e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355245, "epoch": 0.9343434343434344, "step": 185}, {"loss": 0.5660634517669678, "token_acc": 0.8010047446274072, "grad_norm": 2.9851603507995605, "learning_rate": 9.462594179299406e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357613, "epoch": 0.9595959595959596, "step": 190}, {"loss": 0.5602671146392822, "token_acc": 0.8152331953920143, "grad_norm": 0.15361513197422028, "learning_rate": 9.424289624237144e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357337, "epoch": 0.9848484848484849, "step": 195}, {"loss": 0.5035863399505616, "token_acc": 0.8482374484968717, "grad_norm": 0.1556633859872818, "learning_rate": 9.384749641033359e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353905, "epoch": 1.0101010101010102, "step": 200}, {"eval_loss": 0.4684031009674072, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.2829, "eval_samples_per_second": 3.118, "eval_steps_per_second": 3.118, "epoch": 1.0101010101010102, "step": 200}, {"loss": 0.4510298728942871, "token_acc": 0.8351512146752603, "grad_norm": 0.16396360099315643, "learning_rate": 9.343985270739182e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350077, "epoch": 1.0353535353535352, "step": 205}, {"loss": 0.4226827621459961, "token_acc": 
0.8615891614793116, "grad_norm": 0.20765098929405212, "learning_rate": 9.302007896300698e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350194, "epoch": 1.0606060606060606, "step": 210}, {"loss": 0.38046865463256835, "token_acc": 0.8746039856923863, "grad_norm": 0.17130540311336517, "learning_rate": 9.25882923938038e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349574, "epoch": 1.0858585858585859, "step": 215}, {"loss": 0.35848026275634765, "token_acc": 0.8796054540179866, "grad_norm": 0.31240323185920715, "learning_rate": 9.214461357083985e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350614, "epoch": 1.1111111111111112, "step": 220}, {"eval_loss": 0.4786238670349121, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.265, "eval_samples_per_second": 3.162, "eval_steps_per_second": 3.162, "epoch": 1.1111111111111112, "step": 220}, {"loss": 0.48271026611328127, "token_acc": 0.8429425702358118, "grad_norm": 0.3606383800506592, "learning_rate": 9.168916638593736e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349116, "epoch": 1.1363636363636362, "step": 225}, {"loss": 0.39199128150939944, "token_acc": 0.8686677560849746, "grad_norm": 0.2114563286304474, "learning_rate": 9.122207801708802e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.347125, "epoch": 1.1616161616161615, "step": 230}, {"loss": 0.2028397798538208, "token_acc": 0.908675799086758, "grad_norm": 0.24240955710411072, "learning_rate": 9.074347889294016e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.34928, "epoch": 1.1868686868686869, "step": 235}, {"loss": 0.44605064392089844, "token_acc": 0.853990914990266, "grad_norm": 0.4021816551685333, "learning_rate": 9.025350265637815e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350502, "epoch": 1.2121212121212122, "step": 240}, {"eval_loss": 0.477361261844635, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2844, "eval_samples_per_second": 3.114, "eval_steps_per_second": 3.114, "epoch": 1.2121212121212122, "step": 
240}, {"loss": 0.28942854404449464, "token_acc": 0.8561964776215867, "grad_norm": 0.2472194880247116, "learning_rate": 8.975228612720416e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35052, "epoch": 1.2373737373737375, "step": 245}, {"loss": 0.4217637062072754, "token_acc": 0.8497478099283249, "grad_norm": 0.49620357155799866, "learning_rate": 8.923996926393305e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352588, "epoch": 1.2626262626262625, "step": 250}, {"loss": 0.3790408134460449, "token_acc": 0.859628239172237, "grad_norm": 0.2509404122829437, "learning_rate": 8.871669512471068e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.351148, "epoch": 1.2878787878787878, "step": 255}, {"loss": 0.3863436222076416, "token_acc": 0.855022437003797, "grad_norm": 0.39996537566185, "learning_rate": 8.818260982736661e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352259, "epoch": 1.3131313131313131, "step": 260}, {"eval_loss": 0.43004661798477173, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2631, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "epoch": 1.3131313131313131, "step": 260}, {"loss": 0.30726191997528074, "token_acc": 0.8950704812745016, "grad_norm": 0.13070876896381378, "learning_rate": 8.763786250861256e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.348668, "epoch": 1.3383838383838385, "step": 265}, {"loss": 0.28099467754364016, "token_acc": 0.8945386064030132, "grad_norm": 0.29351168870925903, "learning_rate": 8.708260528239788e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349732, "epoch": 1.3636363636363638, "step": 270}, {"loss": 0.2923673152923584, "token_acc": 0.8954248366013072, "grad_norm": 0.29749786853790283, "learning_rate": 8.651699319743347e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.349425, "epoch": 1.3888888888888888, "step": 275}, {"loss": 0.42062225341796877, "token_acc": 0.8647863247863248, "grad_norm": 0.5873435139656067, "learning_rate": 8.594118419389647e-05, "memory(GiB)": 
129.1, "train_speed(iter/s)": 0.350425, "epoch": 1.4141414141414141, "step": 280}, {"eval_loss": 0.4043884575366974, "eval_token_acc": 0.7726377952755905, "eval_runtime": 1.2688, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "epoch": 1.4141414141414141, "step": 280}, {"loss": 0.23297641277313233, "token_acc": 0.8899554336647241, "grad_norm": 0.5294097065925598, "learning_rate": 8.535533905932738e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350232, "epoch": 1.4393939393939394, "step": 285}, {"loss": 0.36497814655303956, "token_acc": 0.8623964437260052, "grad_norm": 0.4506111741065979, "learning_rate": 8.475962138373213e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.351651, "epoch": 1.4646464646464645, "step": 290}, {"loss": 0.3962693214416504, "token_acc": 0.85288089273514, "grad_norm": 0.30804958939552307, "learning_rate": 8.415419751390155e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353152, "epoch": 1.4898989898989898, "step": 295}, {"loss": 0.3206871509552002, "token_acc": 0.8861493836113126, "grad_norm": 1.6068087816238403, "learning_rate": 8.353923650696118e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353426, "epoch": 1.5151515151515151, "step": 300}, {"eval_loss": 0.42214763164520264, "eval_token_acc": 0.7775590551181102, "eval_runtime": 1.2774, "eval_samples_per_second": 3.131, "eval_steps_per_second": 3.131, "epoch": 1.5151515151515151, "step": 300}, {"loss": 0.3652678966522217, "token_acc": 0.8686586614539701, "grad_norm": 0.3600684404373169, "learning_rate": 8.291491008316409e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.350995, "epoch": 1.5404040404040404, "step": 305}, {"loss": 0.30328705310821535, "token_acc": 0.8997613365155132, "grad_norm": 0.33816397190093994, "learning_rate": 8.228139257794012e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352277, "epoch": 1.5656565656565657, "step": 310}, {"loss": 0.32169332504272463, "token_acc": 0.8734251968503937, "grad_norm": 0.849923849105835, 
"learning_rate": 8.163886089321493e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353612, "epoch": 1.5909090909090908, "step": 315}, {"loss": 0.33423264026641847, "token_acc": 0.9003220364974697, "grad_norm": 0.37628281116485596, "learning_rate": 8.098749444801224e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354058, "epoch": 1.6161616161616161, "step": 320}, {"eval_loss": 0.4008274972438812, "eval_token_acc": 0.7726377952755905, "eval_runtime": 1.2671, "eval_samples_per_second": 3.157, "eval_steps_per_second": 3.157, "epoch": 1.6161616161616161, "step": 320}, {"loss": 0.35271801948547366, "token_acc": 0.8523900054318305, "grad_norm": 0.37696319818496704, "learning_rate": 8.032747512835337e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353524, "epoch": 1.6414141414141414, "step": 325}, {"loss": 0.38188652992248534, "token_acc": 0.8839562254800744, "grad_norm": 0.574674129486084, "learning_rate": 7.965898723646776e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35462, "epoch": 1.6666666666666665, "step": 330}, {"loss": 0.38109359741210935, "token_acc": 0.8710840033268644, "grad_norm": 0.31084558367729187, "learning_rate": 7.898221743932888e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354627, "epoch": 1.691919191919192, "step": 335}, {"loss": 0.26326937675476075, "token_acc": 0.9064136125654451, "grad_norm": 0.7837309837341309, "learning_rate": 7.829735471652978e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355662, "epoch": 1.7171717171717171, "step": 340}, {"eval_loss": 0.3601945638656616, "eval_token_acc": 0.7726377952755905, "eval_runtime": 1.2553, "eval_samples_per_second": 3.186, "eval_steps_per_second": 3.186, "epoch": 1.7171717171717171, "step": 340}, {"loss": 0.2819732904434204, "token_acc": 0.894602905312268, "grad_norm": 0.326943576335907, "learning_rate": 7.760459030751284e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35432, "epoch": 1.7424242424242424, "step": 345}, {"loss": 0.16738426685333252, "token_acc": 
0.9215094339622641, "grad_norm": 1.0278775691986084, "learning_rate": 7.690411765816864e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355229, "epoch": 1.7676767676767677, "step": 350}, {"loss": 0.4713289260864258, "token_acc": 0.8477546549835706, "grad_norm": 0.6167912483215332, "learning_rate": 7.619613236681843e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355466, "epoch": 1.7929292929292928, "step": 355}, {"loss": 0.259800124168396, "token_acc": 0.8968109615617801, "grad_norm": 0.26643773913383484, "learning_rate": 7.548083212959588e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355753, "epoch": 1.8181818181818183, "step": 360}, {"eval_loss": 0.37819036841392517, "eval_token_acc": 0.7785433070866141, "eval_runtime": 1.3003, "eval_samples_per_second": 3.076, "eval_steps_per_second": 3.076, "epoch": 1.8181818181818183, "step": 360}, {"loss": 0.3847909212112427, "token_acc": 0.8547150949683439, "grad_norm": 0.709635853767395, "learning_rate": 7.475841668524268e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355576, "epoch": 1.8434343434343434, "step": 365}, {"loss": 0.35011069774627684, "token_acc": 0.8763573543928924, "grad_norm": 0.28540483117103577, "learning_rate": 7.402908775933419e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355573, "epoch": 1.8686868686868687, "step": 370}, {"loss": 0.4088496208190918, "token_acc": 0.8652825291966497, "grad_norm": 0.4313144385814667, "learning_rate": 7.329304900794991e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355232, "epoch": 1.893939393939394, "step": 375}, {"loss": 0.3548138618469238, "token_acc": 0.8879879054425509, "grad_norm": 0.6075259447097778, "learning_rate": 7.255050596080509e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35565, "epoch": 1.9191919191919191, "step": 380}, {"eval_loss": 0.373136043548584, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2667, "eval_samples_per_second": 3.158, "eval_steps_per_second": 3.158, "epoch": 1.9191919191919191, "step": 380}, 
{"loss": 0.3757177352905273, "token_acc": 0.882828778036524, "grad_norm": 0.1639028936624527, "learning_rate": 7.180166596385914e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354333, "epoch": 1.9444444444444444, "step": 385}, {"loss": 0.25887558460235593, "token_acc": 0.8980108083247097, "grad_norm": 0.32813289761543274, "learning_rate": 7.104673812141675e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353935, "epoch": 1.9696969696969697, "step": 390}, {"loss": 0.254361891746521, "token_acc": 0.9006518318723309, "grad_norm": 0.3131479322910309, "learning_rate": 7.02859332377382e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354813, "epoch": 1.9949494949494948, "step": 395}, {"loss": 0.22614221572875975, "token_acc": 0.9400690304361469, "grad_norm": 0.5802826881408691, "learning_rate": 6.951946375817474e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356043, "epoch": 2.0202020202020203, "step": 400}, {"eval_loss": 0.3536229729652405, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2885, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "epoch": 2.0202020202020203, "step": 400}, {"loss": 0.15614408254623413, "token_acc": 0.9098951953178168, "grad_norm": 0.6191949844360352, "learning_rate": 6.874754370984606e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355365, "epoch": 2.0454545454545454, "step": 405}, {"loss": 0.2733434200286865, "token_acc": 0.9188966652943599, "grad_norm": 0.7725083231925964, "learning_rate": 6.797038864187564e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355292, "epoch": 2.0707070707070705, "step": 410}, {"loss": 0.1949324369430542, "token_acc": 0.9226980728051392, "grad_norm": 0.3713182210922241, "learning_rate": 6.718821556520151e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355986, "epoch": 2.095959595959596, "step": 415}, {"loss": 0.09827777743339539, "token_acc": 0.9717413441955194, "grad_norm": 0.2845621109008789, "learning_rate": 6.640124289197845e-05, "memory(GiB)": 129.1, 
"train_speed(iter/s)": 0.356829, "epoch": 2.121212121212121, "step": 420}, {"eval_loss": 0.3582577109336853, "eval_token_acc": 0.7667322834645669, "eval_runtime": 1.2794, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "epoch": 2.121212121212121, "step": 420}, {"loss": 0.19055347442626952, "token_acc": 0.8919627256843331, "grad_norm": 0.607758641242981, "learning_rate": 6.560969037458933e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356579, "epoch": 2.1464646464646466, "step": 425}, {"loss": 0.1357766032218933, "token_acc": 0.9581637268204433, "grad_norm": 0.4930749237537384, "learning_rate": 6.481377904428171e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356034, "epoch": 2.1717171717171717, "step": 430}, {"loss": 0.18788766860961914, "token_acc": 0.9421218961625282, "grad_norm": 0.2689533531665802, "learning_rate": 6.401373114944781e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355275, "epoch": 2.196969696969697, "step": 435}, {"loss": 0.2125793695449829, "token_acc": 0.9272427983539094, "grad_norm": 0.9979881644248962, "learning_rate": 6.320977009356431e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355873, "epoch": 2.2222222222222223, "step": 440}, {"eval_loss": 0.3688502013683319, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2802, "eval_samples_per_second": 3.124, "eval_steps_per_second": 3.124, "epoch": 2.2222222222222223, "step": 440}, {"loss": 0.11968926191329957, "token_acc": 0.9304094308530866, "grad_norm": 0.37610238790512085, "learning_rate": 6.240212037280966e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355188, "epoch": 2.2474747474747474, "step": 445}, {"loss": 0.26689648628234863, "token_acc": 0.9058954807513483, "grad_norm": 0.7159104943275452, "learning_rate": 6.159100751337642e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355771, "epoch": 2.2727272727272725, "step": 450}, {"loss": 0.1854721188545227, "token_acc": 0.9324742268041237, "grad_norm": 0.29641231894493103, "learning_rate": 
6.077665800849568e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355991, "epoch": 2.297979797979798, "step": 455}, {"loss": 0.193935763835907, "token_acc": 0.9377682403433476, "grad_norm": 0.25186142325401306, "learning_rate": 5.99592992551918e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356327, "epoch": 2.323232323232323, "step": 460}, {"eval_loss": 0.3633354604244232, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2615, "eval_samples_per_second": 3.171, "eval_steps_per_second": 3.171, "epoch": 2.323232323232323, "step": 460}, {"loss": 0.176645827293396, "token_acc": 0.8921251348435815, "grad_norm": 0.49181655049324036, "learning_rate": 5.913915949078452e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356557, "epoch": 2.3484848484848486, "step": 465}, {"loss": 0.13740575313568115, "token_acc": 0.9428538968416269, "grad_norm": 0.5645484328269958, "learning_rate": 5.831646772915651e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356147, "epoch": 2.3737373737373737, "step": 470}, {"loss": 0.21261224746704102, "token_acc": 0.9206197398622801, "grad_norm": 1.3345602750778198, "learning_rate": 5.749145369680407e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356639, "epoch": 2.398989898989899, "step": 475}, {"loss": 0.2075648546218872, "token_acc": 0.9252907219944784, "grad_norm": 0.3252560496330261, "learning_rate": 5.666434776868895e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355528, "epoch": 2.4242424242424243, "step": 480}, {"eval_loss": 0.36280357837677, "eval_token_acc": 0.7667322834645669, "eval_runtime": 1.2645, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.4242424242424243, "step": 480}, {"loss": 0.13927946090698243, "token_acc": 0.8984392671341326, "grad_norm": 0.6244832277297974, "learning_rate": 5.583538090390882e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355772, "epoch": 2.4494949494949494, "step": 485}, {"loss": 0.28371801376342776, "token_acc": 0.8928283642224013, 
"grad_norm": 0.5457295179367065, "learning_rate": 5.5004784581204927e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355994, "epoch": 2.474747474747475, "step": 490}, {"loss": 0.16324831247329713, "token_acc": 0.9382183908045977, "grad_norm": 0.26068228483200073, "learning_rate": 5.41727907343245e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356737, "epoch": 2.5, "step": 495}, {"loss": 0.23240807056427001, "token_acc": 0.8918985471558729, "grad_norm": 0.4725530445575714, "learning_rate": 5.3339631687256084e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356548, "epoch": 2.525252525252525, "step": 500}, {"eval_loss": 0.36516159772872925, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2784, "eval_samples_per_second": 3.129, "eval_steps_per_second": 3.129, "epoch": 2.525252525252525, "step": 500}, {"loss": 0.15861610174179078, "token_acc": 0.9060025910464949, "grad_norm": 0.5990637540817261, "learning_rate": 5.250554008935596e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356161, "epoch": 2.5505050505050506, "step": 505}, {"loss": 0.16548032760620118, "token_acc": 0.9370354175776126, "grad_norm": 0.7070275545120239, "learning_rate": 5.167074885038373e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357031, "epoch": 2.5757575757575757, "step": 510}, {"loss": 0.18845115900039672, "token_acc": 0.9271042471042471, "grad_norm": 0.30997011065483093, "learning_rate": 5.0835491075465045e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357488, "epoch": 2.601010101010101, "step": 515}, {"loss": 0.19470884799957275, "token_acc": 0.9188622362039586, "grad_norm": 0.34516477584838867, "learning_rate": 5e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357567, "epoch": 2.6262626262626263, "step": 520}, {"eval_loss": 0.35303670167922974, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.2716, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "epoch": 2.6262626262626263, "step": 520}, {"loss": 0.16326183080673218, "token_acc": 
0.9170015455950541, "grad_norm": 0.400846928358078, "learning_rate": 4.916450892453495e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.357449, "epoch": 2.6515151515151514, "step": 525}, {"loss": 0.2275157690048218, "token_acc": 0.911013136584488, "grad_norm": 0.3526351749897003, "learning_rate": 4.832925114961629e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356693, "epoch": 2.676767676767677, "step": 530}, {"loss": 0.16660224199295043, "token_acc": 0.9465856041689285, "grad_norm": 0.7750332355499268, "learning_rate": 4.749445991064404e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.356093, "epoch": 2.702020202020202, "step": 535}, {"loss": 0.29327480792999266, "token_acc": 0.893456980937661, "grad_norm": 1.008236289024353, "learning_rate": 4.666036831274392e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355675, "epoch": 2.7272727272727275, "step": 540}, {"eval_loss": 0.34143465757369995, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.2634, "eval_samples_per_second": 3.166, "eval_steps_per_second": 3.166, "epoch": 2.7272727272727275, "step": 540}, {"loss": 0.221860933303833, "token_acc": 0.8882824294507026, "grad_norm": 0.5062530040740967, "learning_rate": 4.582720926567552e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355094, "epoch": 2.7525252525252526, "step": 545}, {"loss": 0.171502685546875, "token_acc": 0.9311728853872454, "grad_norm": 0.4163118004798889, "learning_rate": 4.4995215418795085e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355204, "epoch": 2.7777777777777777, "step": 550}, {"loss": 0.18130356073379517, "token_acc": 0.9372671732975711, "grad_norm": 0.6872218251228333, "learning_rate": 4.416461909609119e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35566, "epoch": 2.8030303030303028, "step": 555}, {"loss": 0.15754028558731079, "token_acc": 0.9293805736322005, "grad_norm": 0.2589365839958191, "learning_rate": 4.333565223131107e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355647, "epoch": 
2.8282828282828283, "step": 560}, {"eval_loss": 0.328652024269104, "eval_token_acc": 0.764763779527559, "eval_runtime": 1.2642, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 2.8282828282828283, "step": 560}, {"loss": 0.2055502414703369, "token_acc": 0.9031949899161448, "grad_norm": 0.6874573230743408, "learning_rate": 4.250854630319593e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355089, "epoch": 2.8535353535353534, "step": 565}, {"loss": 0.25478286743164064, "token_acc": 0.903887358432813, "grad_norm": 0.6083143949508667, "learning_rate": 4.1683532270843504e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354778, "epoch": 2.878787878787879, "step": 570}, {"loss": 0.17642589807510375, "token_acc": 0.9331498230436492, "grad_norm": 0.9529440999031067, "learning_rate": 4.0860840509215496e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354847, "epoch": 2.904040404040404, "step": 575}, {"loss": 0.24578819274902344, "token_acc": 0.8957880166106387, "grad_norm": 0.17912031710147858, "learning_rate": 4.0040700744808204e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355458, "epoch": 2.929292929292929, "step": 580}, {"eval_loss": 0.32831043004989624, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2645, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "epoch": 2.929292929292929, "step": 580}, {"loss": 0.21303670406341552, "token_acc": 0.8920780711825488, "grad_norm": 0.7658194303512573, "learning_rate": 3.922334199150432e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355426, "epoch": 2.9545454545454546, "step": 585}, {"loss": 0.19972538948059082, "token_acc": 0.9341101694915255, "grad_norm": 0.7090197801589966, "learning_rate": 3.840899248662358e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354892, "epoch": 2.9797979797979797, "step": 590}, {"loss": 0.2259267807006836, "token_acc": 0.9335699797160243, "grad_norm": 0.26023608446121216, "learning_rate": 3.7597879627190334e-05, "memory(GiB)": 
129.1, "train_speed(iter/s)": 0.354377, "epoch": 3.005050505050505, "step": 595}, {"loss": 0.13799512386322021, "token_acc": 0.957043945174509, "grad_norm": 0.8862583041191101, "learning_rate": 3.6790229906435705e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354572, "epoch": 3.0303030303030303, "step": 600}, {"eval_loss": 0.30323198437690735, "eval_token_acc": 0.7706692913385826, "eval_runtime": 1.2806, "eval_samples_per_second": 3.123, "eval_steps_per_second": 3.123, "epoch": 3.0303030303030303, "step": 600}, {"loss": 0.08972094058990479, "token_acc": 0.9397115384615384, "grad_norm": 0.5918054580688477, "learning_rate": 3.598626885055219e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.3534, "epoch": 3.0555555555555554, "step": 605}, {"loss": 0.04932542443275452, "token_acc": 0.975853123129116, "grad_norm": 0.7078537940979004, "learning_rate": 3.5186220955718306e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354057, "epoch": 3.080808080808081, "step": 610}, {"loss": 0.03949523568153381, "token_acc": 0.9892593421347058, "grad_norm": 0.13521677255630493, "learning_rate": 3.4390309625410686e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354635, "epoch": 3.106060606060606, "step": 615}, {"loss": 0.08747856616973877, "token_acc": 0.9691932624113475, "grad_norm": 0.430328905582428, "learning_rate": 3.3598757108021546e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354612, "epoch": 3.1313131313131315, "step": 620}, {"eval_loss": 0.306226909160614, "eval_token_acc": 0.7696850393700787, "eval_runtime": 1.3165, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.1313131313131315, "step": 620}, {"loss": 0.09797981977462769, "token_acc": 0.9492996646281318, "grad_norm": 0.376487672328949, "learning_rate": 3.281178443479852e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353631, "epoch": 3.1565656565656566, "step": 625}, {"loss": 0.07529096603393555, "token_acc": 0.9698409419541417, "grad_norm": 0.7452987432479858, 
"learning_rate": 3.202961135812437e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354143, "epoch": 3.1818181818181817, "step": 630}, {"loss": 0.1325251579284668, "token_acc": 0.9434507276969225, "grad_norm": 0.5807965993881226, "learning_rate": 3.1252456290153954e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354003, "epoch": 3.207070707070707, "step": 635}, {"loss": 0.11141908168792725, "token_acc": 0.9503339290753456, "grad_norm": 0.4517095685005188, "learning_rate": 3.0480536241825263e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353366, "epoch": 3.2323232323232323, "step": 640}, {"eval_loss": 0.30879175662994385, "eval_token_acc": 0.765748031496063, "eval_runtime": 1.3167, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "epoch": 3.2323232323232323, "step": 640}, {"loss": 0.08485085368156434, "token_acc": 0.9452848128619586, "grad_norm": 0.4209767282009125, "learning_rate": 2.9714066762261823e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352332, "epoch": 3.257575757575758, "step": 645}, {"loss": 0.12707052230834961, "token_acc": 0.9538642869169894, "grad_norm": 0.32493889331817627, "learning_rate": 2.895326187858326e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352802, "epoch": 3.282828282828283, "step": 650}, {"loss": 0.09735980033874511, "token_acc": 0.9682322541419669, "grad_norm": 0.4984124004840851, "learning_rate": 2.8198334036140874e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.352963, "epoch": 3.308080808080808, "step": 655}, {"loss": 0.09166445732116699, "token_acc": 0.9631701631701631, "grad_norm": 0.2004961520433426, "learning_rate": 2.74494940391949e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353022, "epoch": 3.3333333333333335, "step": 660}, {"eval_loss": 0.3101058900356293, "eval_token_acc": 0.7627952755905512, "eval_runtime": 1.2782, "eval_samples_per_second": 3.129, "eval_steps_per_second": 3.129, "epoch": 3.3333333333333335, "step": 660}, {"loss": 0.09054631590843201, "token_acc": 
0.9231199850718418, "grad_norm": 0.8963623046875, "learning_rate": 2.6706950992050094e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353076, "epoch": 3.3585858585858586, "step": 665}, {"loss": 0.10762099027633668, "token_acc": 0.96045197740113, "grad_norm": 0.8532351851463318, "learning_rate": 2.5970912240665813e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353315, "epoch": 3.3838383838383836, "step": 670}, {"loss": 0.07633500695228576, "token_acc": 0.9642299010244835, "grad_norm": 0.36199483275413513, "learning_rate": 2.5241583314757327e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353653, "epoch": 3.409090909090909, "step": 675}, {"loss": 0.1227030634880066, "token_acc": 0.942090395480226, "grad_norm": 0.6680567860603333, "learning_rate": 2.4519167870404125e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354043, "epoch": 3.4343434343434343, "step": 680}, {"eval_loss": 0.3102591335773468, "eval_token_acc": 0.7578740157480315, "eval_runtime": 1.2774, "eval_samples_per_second": 3.131, "eval_steps_per_second": 3.131, "epoch": 3.4343434343434343, "step": 680}, {"loss": 0.11645561456680298, "token_acc": 0.9112238427393786, "grad_norm": 0.2870488464832306, "learning_rate": 2.3803867633181574e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353305, "epoch": 3.45959595959596, "step": 685}, {"loss": 0.13571174144744874, "token_acc": 0.9530231512699483, "grad_norm": 0.666077196598053, "learning_rate": 2.3095882341831372e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353078, "epoch": 3.484848484848485, "step": 690}, {"loss": 0.11649401187896728, "token_acc": 0.9593987292732062, "grad_norm": 0.39280450344085693, "learning_rate": 2.2395409692487175e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353484, "epoch": 3.51010101010101, "step": 695}, {"loss": 0.038886070251464844, "token_acc": 0.9857142857142858, "grad_norm": 0.24035465717315674, "learning_rate": 2.1702645283470236e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354064, "epoch": 
3.5353535353535355, "step": 700}, {"eval_loss": 0.3143807351589203, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.2708, "eval_samples_per_second": 3.148, "eval_steps_per_second": 3.148, "epoch": 3.5353535353535355, "step": 700}, {"loss": 0.06877344250679016, "token_acc": 0.9332363107149354, "grad_norm": 0.3898966610431671, "learning_rate": 2.1017782560671123e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354088, "epoch": 3.5606060606060606, "step": 705}, {"loss": 0.0904355764389038, "token_acc": 0.9646369533375183, "grad_norm": 0.577021062374115, "learning_rate": 2.0341012763532243e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353937, "epoch": 3.5858585858585856, "step": 710}, {"loss": 0.008103035390377045, "token_acc": 0.9974391805377721, "grad_norm": 0.015718597918748856, "learning_rate": 1.967252487164663e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354851, "epoch": 3.611111111111111, "step": 715}, {"loss": 0.0557898998260498, "token_acc": 0.9812704501861672, "grad_norm": 0.6149921417236328, "learning_rate": 1.9012505551987765e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354676, "epoch": 3.6363636363636362, "step": 720}, {"eval_loss": 0.31738603115081787, "eval_token_acc": 0.7618110236220472, "eval_runtime": 1.2704, "eval_samples_per_second": 3.149, "eval_steps_per_second": 3.149, "epoch": 3.6363636363636362, "step": 720}, {"loss": 0.1061089038848877, "token_acc": 0.9314117647058824, "grad_norm": 0.5522142052650452, "learning_rate": 1.836113910678507e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354276, "epoch": 3.6616161616161618, "step": 725}, {"loss": 0.11565899848937988, "token_acc": 0.954354001371339, "grad_norm": 0.3203338086605072, "learning_rate": 1.771860742205988e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35397, "epoch": 3.686868686868687, "step": 730}, {"loss": 0.09632692337036133, "token_acc": 0.9580305687797545, "grad_norm": 0.5480756163597107, "learning_rate": 1.7085089916835923e-05, "memory(GiB)": 
129.1, "train_speed(iter/s)": 0.353862, "epoch": 3.712121212121212, "step": 735}, {"loss": 0.12017930746078491, "token_acc": 0.9624349836255057, "grad_norm": 0.710368812084198, "learning_rate": 1.646076349303884e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354334, "epoch": 3.7373737373737375, "step": 740}, {"eval_loss": 0.3139352798461914, "eval_token_acc": 0.7598425196850394, "eval_runtime": 1.2642, "eval_samples_per_second": 3.164, "eval_steps_per_second": 3.164, "epoch": 3.7373737373737375, "step": 740}, {"loss": 0.06079275012016296, "token_acc": 0.9102909482758621, "grad_norm": 0.3609558939933777, "learning_rate": 1.584580248609846e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354549, "epoch": 3.7626262626262625, "step": 745}, {"loss": 0.03985466659069061, "token_acc": 0.9833689712520789, "grad_norm": 0.7200530767440796, "learning_rate": 1.5240378616267886e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355155, "epoch": 3.787878787878788, "step": 750}, {"loss": 0.18646769523620604, "token_acc": 0.922202486678508, "grad_norm": 0.47436633706092834, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354059, "epoch": 3.813131313131313, "step": 755}, {"loss": 0.017752669751644135, "token_acc": 0.9946977730646872, "grad_norm": 0.4159301817417145, "learning_rate": 1.4058815806103542e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354791, "epoch": 3.8383838383838382, "step": 760}, {"eval_loss": 0.31617671251296997, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.2713, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "epoch": 3.8383838383838382, "step": 760}, {"loss": 0.11076927185058594, "token_acc": 0.9201399452388196, "grad_norm": 0.630095362663269, "learning_rate": 1.3483006802566544e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354628, "epoch": 3.8636363636363638, "step": 765}, {"loss": 0.06400647759437561, "token_acc": 0.9771947032859245, "grad_norm": 0.17331808805465698, 
"learning_rate": 1.2917394717602121e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354776, "epoch": 3.888888888888889, "step": 770}, {"loss": 0.09050332307815552, "token_acc": 0.9741574731751549, "grad_norm": 0.5865471959114075, "learning_rate": 1.2362137491387432e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354814, "epoch": 3.9141414141414144, "step": 775}, {"loss": 0.09545594453811646, "token_acc": 0.9562637969094923, "grad_norm": 0.4024655818939209, "learning_rate": 1.1817390172633403e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354768, "epoch": 3.9393939393939394, "step": 780}, {"eval_loss": 0.32290545105934143, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.2807, "eval_samples_per_second": 3.123, "eval_steps_per_second": 3.123, "epoch": 3.9393939393939394, "step": 780}, {"loss": 0.0838412582874298, "token_acc": 0.9169477234401349, "grad_norm": 2.198448419570923, "learning_rate": 1.1283304875289336e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354661, "epoch": 3.9646464646464645, "step": 785}, {"loss": 0.09766408801078796, "token_acc": 0.9677739216658403, "grad_norm": 0.8399145603179932, "learning_rate": 1.0760030736066951e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355019, "epoch": 3.98989898989899, "step": 790}, {"loss": 0.08901907801628113, "token_acc": 0.9699577530902832, "grad_norm": 0.5167694091796875, "learning_rate": 1.024771387279585e-05, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355271, "epoch": 4.015151515151516, "step": 795}, {"loss": 0.03536704182624817, "token_acc": 0.9874716779954685, "grad_norm": 0.29580003023147583, "learning_rate": 9.746497343621857e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355303, "epoch": 4.040404040404041, "step": 800}, {"eval_loss": 0.32690542936325073, "eval_token_acc": 0.7588582677165354, "eval_runtime": 1.2877, "eval_samples_per_second": 3.106, "eval_steps_per_second": 3.106, "epoch": 4.040404040404041, "step": 800}, {"loss": 0.06580750346183777, "token_acc": 
0.9491756538985993, "grad_norm": 0.6783302426338196, "learning_rate": 9.256521107059834e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354851, "epoch": 4.065656565656566, "step": 805}, {"loss": 0.018977776169776917, "token_acc": 0.9939244351623315, "grad_norm": 0.02713463269174099, "learning_rate": 8.777921982911996e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355276, "epoch": 4.090909090909091, "step": 810}, {"loss": 0.04440165162086487, "token_acc": 0.9830610103432769, "grad_norm": 0.47375404834747314, "learning_rate": 8.310833614062651e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35558, "epoch": 4.116161616161616, "step": 815}, {"loss": 0.0346536248922348, "token_acc": 0.9884083816317432, "grad_norm": 0.2699472904205322, "learning_rate": 7.85538642916015e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355478, "epoch": 4.141414141414141, "step": 820}, {"eval_loss": 0.33224982023239136, "eval_token_acc": 0.7578740157480315, "eval_runtime": 1.266, "eval_samples_per_second": 3.16, "eval_steps_per_second": 3.16, "epoch": 4.141414141414141, "step": 820}, {"loss": 0.060616308450698854, "token_acc": 0.9562847370671227, "grad_norm": 0.46439409255981445, "learning_rate": 7.4117076061961885e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354922, "epoch": 4.166666666666667, "step": 825}, {"loss": 0.07455227375030518, "token_acc": 0.9444043321299639, "grad_norm": 0.32245391607284546, "learning_rate": 6.979921036993042e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354969, "epoch": 4.191919191919192, "step": 830}, {"loss": 0.09966359734535217, "token_acc": 0.9565217391304348, "grad_norm": 0.6579491496086121, "learning_rate": 6.5601472926081766e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355055, "epoch": 4.217171717171717, "step": 835}, {"loss": 0.07703586220741272, "token_acc": 0.9676385773790451, "grad_norm": 0.44027379155158997, "learning_rate": 6.152503589666425e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354947, "epoch": 
4.242424242424242, "step": 840}, {"eval_loss": 0.3352506160736084, "eval_token_acc": 0.7568897637795275, "eval_runtime": 1.2662, "eval_samples_per_second": 3.159, "eval_steps_per_second": 3.159, "epoch": 4.242424242424242, "step": 840}, {"loss": 0.1429282546043396, "token_acc": 0.922793074084587, "grad_norm": 0.8359324932098389, "learning_rate": 5.757103757628573e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354142, "epoch": 4.267676767676767, "step": 845}, {"loss": 0.10208557844161988, "token_acc": 0.961126817447496, "grad_norm": 0.27999410033226013, "learning_rate": 5.374058207005944e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353931, "epoch": 4.292929292929293, "step": 850}, {"loss": 0.035950151085853574, "token_acc": 0.9865144100054377, "grad_norm": 0.39005765318870544, "learning_rate": 5.0034738985296095e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353884, "epoch": 4.318181818181818, "step": 855}, {"loss": 0.034918776154518126, "token_acc": 0.9884200718754159, "grad_norm": 0.254375696182251, "learning_rate": 4.645454313282965e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353895, "epoch": 4.343434343434343, "step": 860}, {"eval_loss": 0.3357633650302887, "eval_token_acc": 0.7539370078740157, "eval_runtime": 1.2612, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "epoch": 4.343434343434343, "step": 860}, {"loss": 0.04228464365005493, "token_acc": 0.954431050470261, "grad_norm": 0.5025836229324341, "learning_rate": 4.3000994238058644e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.353642, "epoch": 4.3686868686868685, "step": 865}, {"loss": 0.03816230297088623, "token_acc": 0.9892336922102597, "grad_norm": 0.6179090142250061, "learning_rate": 3.967505666178556e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35391, "epoch": 4.393939393939394, "step": 870}, {"loss": 0.023942221701145173, "token_acc": 0.9865377322715206, "grad_norm": 0.42816147208213806, "learning_rate": 3.647765913093132e-06, "memory(GiB)": 129.1, 
"train_speed(iter/s)": 0.354293, "epoch": 4.41919191919192, "step": 875}, {"loss": 0.05464286208152771, "token_acc": 0.9736096615476368, "grad_norm": 0.40714192390441895, "learning_rate": 3.340969447919873e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354416, "epoch": 4.444444444444445, "step": 880}, {"eval_loss": 0.3359661400318146, "eval_token_acc": 0.7559055118110236, "eval_runtime": 1.3133, "eval_samples_per_second": 3.046, "eval_steps_per_second": 3.046, "epoch": 4.444444444444445, "step": 880}, {"loss": 0.03318539261817932, "token_acc": 0.9598147220831247, "grad_norm": 0.34820908308029175, "learning_rate": 3.0472019397761064e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35415, "epoch": 4.46969696969697, "step": 885}, {"loss": 0.05409420132637024, "token_acc": 0.9825274278748476, "grad_norm": 0.42267289757728577, "learning_rate": 2.7665454196040664e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354092, "epoch": 4.494949494949495, "step": 890}, {"loss": 0.024935531616210937, "token_acc": 0.9860741347532255, "grad_norm": 0.4588811993598938, "learning_rate": 2.4990782572647975e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354453, "epoch": 4.52020202020202, "step": 895}, {"loss": 0.11395398378372193, "token_acc": 0.9591113972955569, "grad_norm": 0.6112403869628906, "learning_rate": 2.2448751396543787e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354293, "epoch": 4.545454545454545, "step": 900}, {"eval_loss": 0.3369322121143341, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.3215, "eval_samples_per_second": 3.027, "eval_steps_per_second": 3.027, "epoch": 4.545454545454545, "step": 900}, {"loss": 0.00986407846212387, "token_acc": 0.9589934762348555, "grad_norm": 0.2242911458015442, "learning_rate": 2.004007049848461e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354216, "epoch": 4.570707070707071, "step": 905}, {"loss": 0.015086154639720916, "token_acc": 0.9951715374841169, "grad_norm": 0.14168986678123474, "learning_rate": 
1.7765412472811771e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354789, "epoch": 4.595959595959596, "step": 910}, {"loss": 0.0722315788269043, "token_acc": 0.9734163755126842, "grad_norm": 0.027493759989738464, "learning_rate": 1.5625412489637337e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354981, "epoch": 4.621212121212121, "step": 915}, {"loss": 0.029132437705993653, "token_acc": 0.9898477157360406, "grad_norm": 0.2218606173992157, "learning_rate": 1.3620668117481472e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355396, "epoch": 4.646464646464646, "step": 920}, {"eval_loss": 0.33511751890182495, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.2779, "eval_samples_per_second": 3.13, "eval_steps_per_second": 3.13, "epoch": 4.646464646464646, "step": 920}, {"loss": 0.014484831690788269, "token_acc": 0.9550669216061185, "grad_norm": 0.5357446074485779, "learning_rate": 1.1751739156407649e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355269, "epoch": 4.671717171717171, "step": 925}, {"loss": 0.09904469847679138, "token_acc": 0.9562597200622084, "grad_norm": 0.034110553562641144, "learning_rate": 1.0019147481706625e-06, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355569, "epoch": 4.696969696969697, "step": 930}, {"loss": 0.09365715980529785, "token_acc": 0.9637131611788423, "grad_norm": 0.46665722131729126, "learning_rate": 8.423376898168245e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355182, "epoch": 4.722222222222222, "step": 935}, {"loss": 0.04413898587226868, "token_acc": 0.9745094750964279, "grad_norm": 0.08903438597917557, "learning_rate": 6.964873004985717e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355403, "epoch": 4.747474747474747, "step": 940}, {"eval_loss": 0.33372214436531067, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.3056, "eval_samples_per_second": 3.064, "eval_steps_per_second": 3.064, "epoch": 4.747474747474747, "step": 940}, {"loss": 0.039725151658058164, "token_acc": 
0.9612076380526406, "grad_norm": 0.2658371031284332, "learning_rate": 5.644043071326932e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354412, "epoch": 4.7727272727272725, "step": 945}, {"loss": 0.03821060359477997, "token_acc": 0.975949000289771, "grad_norm": 0.03347943350672722, "learning_rate": 4.461255922609986e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354365, "epoch": 4.797979797979798, "step": 950}, {"loss": 0.01510193943977356, "token_acc": 0.995788365359305, "grad_norm": 0.37791627645492554, "learning_rate": 3.416841837512952e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354476, "epoch": 4.8232323232323235, "step": 955}, {"loss": 0.08025823831558228, "token_acc": 0.9634054135793134, "grad_norm": 0.6039087176322937, "learning_rate": 2.511092455747932e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354659, "epoch": 4.848484848484849, "step": 960}, {"eval_loss": 0.33536115288734436, "eval_token_acc": 0.7559055118110236, "eval_runtime": 1.2813, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "epoch": 4.848484848484849, "step": 960}, {"loss": 0.04071699380874634, "token_acc": 0.9473123191716156, "grad_norm": 0.24173401296138763, "learning_rate": 1.7442606966242004e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354442, "epoch": 4.873737373737374, "step": 965}, {"loss": 0.04371882379055023, "token_acc": 0.9862877911779283, "grad_norm": 0.4899163246154785, "learning_rate": 1.1165606884234181e-07, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.35482, "epoch": 4.898989898989899, "step": 970}, {"loss": 0.02706504464149475, "token_acc": 0.9944558521560575, "grad_norm": 0.2417013943195343, "learning_rate": 6.281677086071303e-08, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355295, "epoch": 4.924242424242424, "step": 975}, {"loss": 0.069762122631073, "token_acc": 0.9661454379839077, "grad_norm": 0.11299926042556763, "learning_rate": 2.792181348726941e-08, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355325, "epoch": 
4.94949494949495, "step": 980}, {"eval_loss": 0.33634763956069946, "eval_token_acc": 0.7549212598425197, "eval_runtime": 1.2882, "eval_samples_per_second": 3.105, "eval_steps_per_second": 3.105, "epoch": 4.94949494949495, "step": 980}, {"loss": 0.13659827709197997, "token_acc": 0.9222326748196927, "grad_norm": 0.333428293466568, "learning_rate": 6.980940707146389e-09, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.354804, "epoch": 4.974747474747475, "step": 985}, {"loss": 0.016735257208347322, "token_acc": 0.9945504087193461, "grad_norm": 0.2640458941459656, "learning_rate": 0.0, "memory(GiB)": 129.1, "train_speed(iter/s)": 0.355027, "epoch": 5.0, "step": 990}, {"eval_loss": 0.3339096009731293, "eval_token_acc": 0.7559055118110236, "eval_runtime": 1.2791, "eval_samples_per_second": 3.127, "eval_steps_per_second": 3.127, "epoch": 5.0, "step": 990}, {"train_runtime": 2791.2937, "train_samples_per_second": 0.709, "train_steps_per_second": 0.355, "total_flos": 2.95876726119936e+17, "train_loss": 0.2376624894563598, "epoch": 5.0, "step": 990}], "memory": 129.099609375} diff --git a/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs/events.out.tfevents.1737736599.kml-task-547024-record-9965643-prod-worker-0.63720.0 b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs/events.out.tfevents.1737736599.kml-task-547024-record-9965643-prod-worker-0.63720.0 new file mode 100644 index 0000000000000000000000000000000000000000..0e98d619f4b88ed7e8c5e8c848a2c91d971b73c6 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-32b_400_0.5_sft_4200_rank16_epoch5_what/v1-20250124-163508/runs/events.out.tfevents.1737736599.kml-task-547024-record-9965643-prod-worker-0.63720.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8513f5557cd99b8a941abca39b005029debe0f0e79fb826d601025441ebfbda7 +size 98871 diff --git 
a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0da9dd9821596948ce1ce2526dc00d1f67225da3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", 
+ "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": 
null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": 
null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + 
"lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, 
fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, 
torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/README.md b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## 
Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a4f36c2d9a223f0b0ab73f95d272609e03a479 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": 
"megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "o_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..03cacb2618999c7385a5676111565a237b5123e5 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd29fd974babbe3211f8564fe5232d63ee2e2464b0d27fe99cac62d97fbba0d +size 828526568 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/additional_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..0da9dd9821596948ce1ce2526dc00d1f67225da3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, 
hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, 
eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, 
use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/optimizer.pt b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9ed457229a09b342b1869c4a4109b74002ef28e --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d6f7dc66c6b97101ace50d94186dc56193ae6deea38b705d13c0a4b2c3c655 +size 1657698290 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/rng_state.pth b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a13477db4058a5f775bda0ea667a1996b4eac4f --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c767e22b1e6c34f7b81670a21ccd4c78f76558f5b80333707af538c5dee1c6be +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..33f55927d7b0a02bd65e479741628f875fa71c41 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f37e2444a7953bc3746c9a7e9a6be0d0a4b61d5d232fa5838badcf9940a058b9 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/trainer_state.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a4d141fb8362ddc85b7f9c868ae60c61cebd93b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/trainer_state.json @@ -0,0 +1,1072 @@ +{ + "best_metric": 0.29718354, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420", + "epoch": 2.121212121212121, + "eval_steps": 20, + "global_step": 420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.5129857659339905, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.569739043712616, + "memory(GiB)": 144.03, + "step": 1, + "token_acc": 0.8513761467889909, + "train_speed(iter/s)": 0.135022 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.8691070675849915, + "learning_rate": 1e-05, + "loss": 0.7468794584274292, + "memory(GiB)": 153.42, + "step": 5, + "token_acc": 0.8297613248904043, + "train_speed(iter/s)": 0.212168 + }, + { + "epoch": 0.050505050505050504, 
+ "grad_norm": 0.5965867638587952, + "learning_rate": 2e-05, + "loss": 0.7946175098419189, + "memory(GiB)": 160.41, + "step": 10, + "token_acc": 0.787320071162866, + "train_speed(iter/s)": 0.221849 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.548611044883728, + "learning_rate": 3e-05, + "loss": 0.7008682727813721, + "memory(GiB)": 170.14, + "step": 15, + "token_acc": 0.8016944665078104, + "train_speed(iter/s)": 0.218317 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 4.018751621246338, + "learning_rate": 4e-05, + "loss": 0.5246500968933105, + "memory(GiB)": 170.14, + "step": 20, + "token_acc": 0.8706467661691543, + "train_speed(iter/s)": 0.236511 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 0.7290887236595154, + "eval_runtime": 1.8127, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 2.207, + "eval_token_acc": 0.7212787212787213, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.45920249819755554, + "learning_rate": 5e-05, + "loss": 0.48354249000549315, + "memory(GiB)": 179.18, + "step": 25, + "token_acc": 0.8261477045908183, + "train_speed(iter/s)": 0.214846 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.1457232236862183, + "learning_rate": 6e-05, + "loss": 0.5703897476196289, + "memory(GiB)": 179.18, + "step": 30, + "token_acc": 0.8114154296466652, + "train_speed(iter/s)": 0.223062 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.3219285309314728, + "learning_rate": 7e-05, + "loss": 0.3709995269775391, + "memory(GiB)": 179.18, + "step": 35, + "token_acc": 0.8511754068716094, + "train_speed(iter/s)": 0.226934 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.2930394113063812, + "learning_rate": 8e-05, + "loss": 0.44092235565185545, + "memory(GiB)": 179.18, + "step": 40, + "token_acc": 0.8480160435467698, + "train_speed(iter/s)": 0.226914 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5398522615432739, + "eval_runtime": 1.8136, + "eval_samples_per_second": 
2.206, + "eval_steps_per_second": 2.206, + "eval_token_acc": 0.7362637362637363, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.4423171281814575, + "learning_rate": 9e-05, + "loss": 0.4420435428619385, + "memory(GiB)": 179.18, + "step": 45, + "token_acc": 0.83846547314578, + "train_speed(iter/s)": 0.218055 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.356607049703598, + "learning_rate": 0.0001, + "loss": 0.5247397899627686, + "memory(GiB)": 179.18, + "step": 50, + "token_acc": 0.8684412312410998, + "train_speed(iter/s)": 0.213301 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.41687265038490295, + "learning_rate": 9.999301905929286e-05, + "loss": 0.46730861663818357, + "memory(GiB)": 179.18, + "step": 55, + "token_acc": 0.8472682119205298, + "train_speed(iter/s)": 0.21328 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.7148261666297913, + "learning_rate": 9.997207818651274e-05, + "loss": 0.36838181018829347, + "memory(GiB)": 179.18, + "step": 60, + "token_acc": 0.8551724137931035, + "train_speed(iter/s)": 0.218719 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.5127567052841187, + "eval_runtime": 1.8179, + "eval_samples_per_second": 2.2, + "eval_steps_per_second": 2.2, + "eval_token_acc": 0.7422577422577422, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.6570205688476562, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5411728858947754, + "memory(GiB)": 179.18, + "step": 65, + "token_acc": 0.8298285714285715, + "train_speed(iter/s)": 0.213853 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.2901982367038727, + "learning_rate": 9.988834393115767e-05, + "loss": 0.39851596355438235, + "memory(GiB)": 179.18, + "step": 70, + "token_acc": 0.8732449297971919, + "train_speed(iter/s)": 0.211799 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.31978854537010193, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5697728157043457, + "memory(GiB)": 179.18, + "step": 
75, + "token_acc": 0.8354898336414048, + "train_speed(iter/s)": 0.215997 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.3901304304599762, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6011258602142334, + "memory(GiB)": 179.25, + "step": 80, + "token_acc": 0.8381935097951249, + "train_speed(iter/s)": 0.216474 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5226491093635559, + "eval_runtime": 1.8268, + "eval_samples_per_second": 2.19, + "eval_steps_per_second": 2.19, + "eval_token_acc": 0.7392607392607392, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.27575281262397766, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5449016571044922, + "memory(GiB)": 179.25, + "step": 85, + "token_acc": 0.8128, + "train_speed(iter/s)": 0.213104 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.6381244659423828, + "learning_rate": 9.9553874407739e-05, + "loss": 0.4397461414337158, + "memory(GiB)": 179.25, + "step": 90, + "token_acc": 0.8472657610588645, + "train_speed(iter/s)": 0.214392 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.3371107280254364, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3532680034637451, + "memory(GiB)": 179.25, + "step": 95, + "token_acc": 0.8662573411639082, + "train_speed(iter/s)": 0.21374 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.3131839334964752, + "learning_rate": 9.930351269950143e-05, + "loss": 0.41753764152526857, + "memory(GiB)": 194.67, + "step": 100, + "token_acc": 0.8640469738030714, + "train_speed(iter/s)": 0.211078 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5102224349975586, + "eval_runtime": 1.8327, + "eval_samples_per_second": 2.183, + "eval_steps_per_second": 2.183, + "eval_token_acc": 0.7382617382617382, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.39832383394241333, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5459073543548584, + "memory(GiB)": 194.67, + "step": 105, + "token_acc": 
0.8039112050739958, + "train_speed(iter/s)": 0.210332 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.300778865814209, + "learning_rate": 9.899808525182935e-05, + "loss": 0.5121739864349365, + "memory(GiB)": 194.67, + "step": 110, + "token_acc": 0.8097281831187411, + "train_speed(iter/s)": 0.214826 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.34517738223075867, + "learning_rate": 9.882482608435923e-05, + "loss": 0.451249361038208, + "memory(GiB)": 194.67, + "step": 115, + "token_acc": 0.8614628614628614, + "train_speed(iter/s)": 0.211594 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7278887033462524, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5085325717926026, + "memory(GiB)": 194.67, + "step": 120, + "token_acc": 0.8130574826560951, + "train_speed(iter/s)": 0.21067 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.509061336517334, + "eval_runtime": 1.7896, + "eval_samples_per_second": 2.235, + "eval_steps_per_second": 2.235, + "eval_token_acc": 0.7382617382617382, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.38017159700393677, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4689138889312744, + "memory(GiB)": 194.67, + "step": 125, + "token_acc": 0.8352281825460368, + "train_speed(iter/s)": 0.210631 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.40452879667282104, + "learning_rate": 9.822345875271883e-05, + "loss": 0.4758878707885742, + "memory(GiB)": 194.67, + "step": 130, + "token_acc": 0.8449714013346044, + "train_speed(iter/s)": 0.209823 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.41681820154190063, + "learning_rate": 9.799599295015154e-05, + "loss": 0.3720943212509155, + "memory(GiB)": 194.67, + "step": 135, + "token_acc": 0.8728339854667412, + "train_speed(iter/s)": 0.209914 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.43215978145599365, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5305635452270507, + "memory(GiB)": 194.67, + "step": 
140, + "token_acc": 0.8278240499739719, + "train_speed(iter/s)": 0.20961 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.4448011815547943, + "eval_runtime": 1.8087, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7502497502497503, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.5140364766120911, + "learning_rate": 9.750092174273521e-05, + "loss": 0.34435036182403567, + "memory(GiB)": 194.67, + "step": 145, + "token_acc": 0.8617477760334903, + "train_speed(iter/s)": 0.207394 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.8977436423301697, + "learning_rate": 9.723345458039594e-05, + "loss": 0.4015669345855713, + "memory(GiB)": 194.67, + "step": 150, + "token_acc": 0.8720949673967564, + "train_speed(iter/s)": 0.20834 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.6019405126571655, + "learning_rate": 9.69527980602239e-05, + "loss": 0.4199058055877686, + "memory(GiB)": 194.67, + "step": 155, + "token_acc": 0.8565537923278771, + "train_speed(iter/s)": 0.20801 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.41383394598960876, + "learning_rate": 9.665903055208014e-05, + "loss": 0.34770309925079346, + "memory(GiB)": 194.67, + "step": 160, + "token_acc": 0.8795408083031924, + "train_speed(iter/s)": 0.208783 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.4233054220676422, + "eval_runtime": 1.8491, + "eval_samples_per_second": 2.163, + "eval_steps_per_second": 2.163, + "eval_token_acc": 0.7442557442557443, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.2816776931285858, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4095714569091797, + "memory(GiB)": 194.67, + "step": 165, + "token_acc": 0.8456421395601412, + "train_speed(iter/s)": 0.20789 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.41569507122039795, + "learning_rate": 9.603249433382144e-05, + "loss": 0.45749435424804685, + "memory(GiB)": 194.67, + "step": 170, + 
"token_acc": 0.8532716457369465, + "train_speed(iter/s)": 0.20746 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.4932589828968048, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4164144515991211, + "memory(GiB)": 194.67, + "step": 175, + "token_acc": 0.8524394404640054, + "train_speed(iter/s)": 0.208373 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3632556200027466, + "learning_rate": 9.535454568671704e-05, + "loss": 0.422211742401123, + "memory(GiB)": 194.67, + "step": 180, + "token_acc": 0.8620525059665871, + "train_speed(iter/s)": 0.208791 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.45568227767944336, + "eval_runtime": 1.8409, + "eval_samples_per_second": 2.173, + "eval_steps_per_second": 2.173, + "eval_token_acc": 0.7582417582417582, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.4775158166885376, + "learning_rate": 9.49965261014704e-05, + "loss": 0.49337053298950195, + "memory(GiB)": 194.67, + "step": 185, + "token_acc": 0.8209519012843113, + "train_speed(iter/s)": 0.207976 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 2.7428700923919678, + "learning_rate": 9.462594179299406e-05, + "loss": 0.7271251678466797, + "memory(GiB)": 194.67, + "step": 190, + "token_acc": 0.7965624119470274, + "train_speed(iter/s)": 0.209678 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.46910813450813293, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5494725227355957, + "memory(GiB)": 194.67, + "step": 195, + "token_acc": 0.8197339246119734, + "train_speed(iter/s)": 0.208894 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.2979983389377594, + "learning_rate": 9.384749641033359e-05, + "loss": 0.4731719970703125, + "memory(GiB)": 194.67, + "step": 200, + "token_acc": 0.8567099230709457, + "train_speed(iter/s)": 0.20626 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.499001145362854, + "eval_runtime": 1.8407, + "eval_samples_per_second": 2.173, + "eval_steps_per_second": 2.173, + 
"eval_token_acc": 0.7502497502497503, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.38004475831985474, + "learning_rate": 9.343985270739182e-05, + "loss": 0.42464404106140136, + "memory(GiB)": 194.67, + "step": 205, + "token_acc": 0.8373205741626795, + "train_speed(iter/s)": 0.203763 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.4347917437553406, + "learning_rate": 9.302007896300698e-05, + "loss": 0.3545402765274048, + "memory(GiB)": 194.67, + "step": 210, + "token_acc": 0.8774455518641565, + "train_speed(iter/s)": 0.203468 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.38388073444366455, + "learning_rate": 9.25882923938038e-05, + "loss": 0.32962794303894044, + "memory(GiB)": 194.67, + "step": 215, + "token_acc": 0.8891755236817666, + "train_speed(iter/s)": 0.202581 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5489069223403931, + "learning_rate": 9.214461357083985e-05, + "loss": 0.2972090482711792, + "memory(GiB)": 194.67, + "step": 220, + "token_acc": 0.9002027809965237, + "train_speed(iter/s)": 0.202983 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.5020791292190552, + "eval_runtime": 1.8362, + "eval_samples_per_second": 2.178, + "eval_steps_per_second": 2.178, + "eval_token_acc": 0.7532467532467533, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.602890133857727, + "learning_rate": 9.168916638593736e-05, + "loss": 0.42758522033691404, + "memory(GiB)": 194.67, + "step": 225, + "token_acc": 0.8501907293954456, + "train_speed(iter/s)": 0.202184 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.39421549439430237, + "learning_rate": 9.122207801708802e-05, + "loss": 0.36005940437316897, + "memory(GiB)": 194.67, + "step": 230, + "token_acc": 0.8754238800642513, + "train_speed(iter/s)": 0.200825 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.46965521574020386, + "learning_rate": 9.074347889294016e-05, + "loss": 0.15720741748809813, + "memory(GiB)": 194.67, + 
"step": 235, + "token_acc": 0.9282419272168119, + "train_speed(iter/s)": 0.202308 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.7477562427520752, + "learning_rate": 9.025350265637815e-05, + "loss": 0.3855461120605469, + "memory(GiB)": 194.67, + "step": 240, + "token_acc": 0.8711264141662568, + "train_speed(iter/s)": 0.202911 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5334002375602722, + "eval_runtime": 1.8268, + "eval_samples_per_second": 2.19, + "eval_steps_per_second": 2.19, + "eval_token_acc": 0.7442557442557443, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.5034751892089844, + "learning_rate": 8.975228612720416e-05, + "loss": 0.24191198348999024, + "memory(GiB)": 194.67, + "step": 245, + "token_acc": 0.8654323028599769, + "train_speed(iter/s)": 0.203143 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.7961943745613098, + "learning_rate": 8.923996926393305e-05, + "loss": 0.3798638105392456, + "memory(GiB)": 194.67, + "step": 250, + "token_acc": 0.8645948945615982, + "train_speed(iter/s)": 0.204558 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.4535170793533325, + "learning_rate": 8.871669512471068e-05, + "loss": 0.3437025547027588, + "memory(GiB)": 194.67, + "step": 255, + "token_acc": 0.87003341997772, + "train_speed(iter/s)": 0.203403 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.7094895243644714, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3248720645904541, + "memory(GiB)": 194.67, + "step": 260, + "token_acc": 0.8757570513929746, + "train_speed(iter/s)": 0.204064 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4627455472946167, + "eval_runtime": 1.8037, + "eval_samples_per_second": 2.218, + "eval_steps_per_second": 2.218, + "eval_token_acc": 0.7502497502497503, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.22084304690361023, + "learning_rate": 8.763786250861256e-05, + "loss": 0.2554438352584839, + "memory(GiB)": 194.67, + "step": 265, + 
"token_acc": 0.9007541995200549, + "train_speed(iter/s)": 0.201941 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.44375061988830566, + "learning_rate": 8.708260528239788e-05, + "loss": 0.23369157314300537, + "memory(GiB)": 194.67, + "step": 270, + "token_acc": 0.910726525017135, + "train_speed(iter/s)": 0.202564 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.42193475365638733, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2531747817993164, + "memory(GiB)": 194.67, + "step": 275, + "token_acc": 0.9063345966432051, + "train_speed(iter/s)": 0.20219 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.8283807635307312, + "learning_rate": 8.594118419389647e-05, + "loss": 0.37001192569732666, + "memory(GiB)": 194.67, + "step": 280, + "token_acc": 0.8865761157170576, + "train_speed(iter/s)": 0.20285 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.427340030670166, + "eval_runtime": 1.8284, + "eval_samples_per_second": 2.188, + "eval_steps_per_second": 2.188, + "eval_token_acc": 0.7552447552447552, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.7095621824264526, + "learning_rate": 8.535533905932738e-05, + "loss": 0.18019050359725952, + "memory(GiB)": 194.67, + "step": 285, + "token_acc": 0.9067105947633085, + "train_speed(iter/s)": 0.203001 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.7720094323158264, + "learning_rate": 8.475962138373213e-05, + "loss": 0.33577933311462405, + "memory(GiB)": 194.67, + "step": 290, + "token_acc": 0.8656873032528857, + "train_speed(iter/s)": 0.203902 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 1.2524478435516357, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3461789131164551, + "memory(GiB)": 194.67, + "step": 295, + "token_acc": 0.8515314472761282, + "train_speed(iter/s)": 0.204926 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 2.095057725906372, + "learning_rate": 8.353923650696118e-05, + "loss": 0.28853349685668944, + "memory(GiB)": 
194.67, + "step": 300, + "token_acc": 0.8904656319290466, + "train_speed(iter/s)": 0.205093 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.43511924147605896, + "eval_runtime": 1.8124, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 2.207, + "eval_token_acc": 0.7432567432567433, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.516942024230957, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3344187498092651, + "memory(GiB)": 194.67, + "step": 305, + "token_acc": 0.8720779866706456, + "train_speed(iter/s)": 0.20388 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.5044359564781189, + "learning_rate": 8.228139257794012e-05, + "loss": 0.2534762382507324, + "memory(GiB)": 194.68, + "step": 310, + "token_acc": 0.9083986562150056, + "train_speed(iter/s)": 0.20477 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 1.4840720891952515, + "learning_rate": 8.163886089321493e-05, + "loss": 0.23027021884918214, + "memory(GiB)": 194.68, + "step": 315, + "token_acc": 0.9053683385579937, + "train_speed(iter/s)": 0.205493 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.47270119190216064, + "learning_rate": 8.098749444801224e-05, + "loss": 0.2922214031219482, + "memory(GiB)": 194.68, + "step": 320, + "token_acc": 0.9086255041886441, + "train_speed(iter/s)": 0.205726 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.375497043132782, + "eval_runtime": 1.7963, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 2.227, + "eval_token_acc": 0.7572427572427572, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.5327674746513367, + "learning_rate": 8.032747512835337e-05, + "loss": 0.2936864376068115, + "memory(GiB)": 194.68, + "step": 325, + "token_acc": 0.8697649283977303, + "train_speed(iter/s)": 0.205415 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7329817414283752, + "learning_rate": 7.965898723646776e-05, + "loss": 0.3568688154220581, + "memory(GiB)": 194.68, + 
"step": 330, + "token_acc": 0.8953463435556509, + "train_speed(iter/s)": 0.20625 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.46463268995285034, + "learning_rate": 7.898221743932888e-05, + "loss": 0.28799734115600584, + "memory(GiB)": 194.68, + "step": 335, + "token_acc": 0.8915866741953699, + "train_speed(iter/s)": 0.206245 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.9827662110328674, + "learning_rate": 7.829735471652978e-05, + "loss": 0.19362801313400269, + "memory(GiB)": 194.68, + "step": 340, + "token_acc": 0.9217057761732852, + "train_speed(iter/s)": 0.207024 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.36126405000686646, + "eval_runtime": 1.7972, + "eval_samples_per_second": 2.226, + "eval_steps_per_second": 2.226, + "eval_token_acc": 0.7492507492507493, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.5373030304908752, + "learning_rate": 7.760459030751284e-05, + "loss": 0.24427511692047119, + "memory(GiB)": 194.68, + "step": 345, + "token_acc": 0.9034146341463415, + "train_speed(iter/s)": 0.20636 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.4386836290359497, + "learning_rate": 7.690411765816864e-05, + "loss": 0.1736771821975708, + "memory(GiB)": 194.68, + "step": 350, + "token_acc": 0.9324116743471582, + "train_speed(iter/s)": 0.207166 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.9116824269294739, + "learning_rate": 7.619613236681843e-05, + "loss": 0.3541959285736084, + "memory(GiB)": 194.68, + "step": 355, + "token_acc": 0.8842233999184672, + "train_speed(iter/s)": 0.207241 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.32813596725463867, + "learning_rate": 7.548083212959588e-05, + "loss": 0.2200457811355591, + "memory(GiB)": 194.68, + "step": 360, + "token_acc": 0.9080932784636488, + "train_speed(iter/s)": 0.207214 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.35305824875831604, + "eval_runtime": 1.8366, + "eval_samples_per_second": 2.178, + 
"eval_steps_per_second": 2.178, + "eval_token_acc": 0.7532467532467533, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.9034874439239502, + "learning_rate": 7.475841668524268e-05, + "loss": 0.28913445472717286, + "memory(GiB)": 194.68, + "step": 365, + "token_acc": 0.870794734275963, + "train_speed(iter/s)": 0.207182 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.37799835205078125, + "learning_rate": 7.402908775933419e-05, + "loss": 0.29883947372436526, + "memory(GiB)": 194.68, + "step": 370, + "token_acc": 0.8907584448693435, + "train_speed(iter/s)": 0.207123 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.5515828132629395, + "learning_rate": 7.329304900794991e-05, + "loss": 0.34903314113616946, + "memory(GiB)": 194.68, + "step": 375, + "token_acc": 0.8823183635081119, + "train_speed(iter/s)": 0.206817 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.7102660536766052, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3167442321777344, + "memory(GiB)": 194.68, + "step": 380, + "token_acc": 0.8950437317784257, + "train_speed(iter/s)": 0.206935 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.31274980306625366, + "eval_runtime": 1.8255, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 2.191, + "eval_token_acc": 0.7632367632367633, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.24322476983070374, + "learning_rate": 7.180166596385914e-05, + "loss": 0.33083691596984866, + "memory(GiB)": 194.68, + "step": 385, + "token_acc": 0.8845164609053497, + "train_speed(iter/s)": 0.206393 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.3991744816303253, + "learning_rate": 7.104673812141675e-05, + "loss": 0.23392996788024903, + "memory(GiB)": 194.68, + "step": 390, + "token_acc": 0.9032924310533349, + "train_speed(iter/s)": 0.206252 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.38407906889915466, + "learning_rate": 7.02859332377382e-05, + "loss": 
0.19444363117218016, + "memory(GiB)": 194.68, + "step": 395, + "token_acc": 0.9170926872638364, + "train_speed(iter/s)": 0.206907 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.5186311602592468, + "learning_rate": 6.951946375817474e-05, + "loss": 0.17200204133987426, + "memory(GiB)": 194.68, + "step": 400, + "token_acc": 0.9564939219449776, + "train_speed(iter/s)": 0.207883 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3031849265098572, + "eval_runtime": 1.7906, + "eval_samples_per_second": 2.234, + "eval_steps_per_second": 2.234, + "eval_token_acc": 0.7542457542457542, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6981443166732788, + "learning_rate": 6.874754370984606e-05, + "loss": 0.12331185340881348, + "memory(GiB)": 194.68, + "step": 405, + "token_acc": 0.9200656994251301, + "train_speed(iter/s)": 0.207599 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.9956308603286743, + "learning_rate": 6.797038864187564e-05, + "loss": 0.12755507230758667, + "memory(GiB)": 194.68, + "step": 410, + "token_acc": 0.9337340775726349, + "train_speed(iter/s)": 0.207712 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.5786072611808777, + "learning_rate": 6.718821556520151e-05, + "loss": 0.12629324197769165, + "memory(GiB)": 194.68, + "step": 415, + "token_acc": 0.9433671220802116, + "train_speed(iter/s)": 0.208254 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.40014225244522095, + "learning_rate": 6.640124289197845e-05, + "loss": 0.08971643447875977, + "memory(GiB)": 194.68, + "step": 420, + "token_acc": 0.9842690534309737, + "train_speed(iter/s)": 0.208984 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.2971835434436798, + "eval_runtime": 1.8393, + "eval_samples_per_second": 2.175, + "eval_steps_per_second": 2.175, + "eval_token_acc": 0.7622377622377622, + "step": 420 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + 
"stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6727113688131174e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/training_args.bin b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..84723354b7c97cc0656161a46b521532fa73cd32 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ad8b7f2148494eb493ac7d940a063889b4311c7edf4791f9b3ad8178a343be +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/README.md b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** 
[More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a4f36c2d9a223f0b0ab73f95d272609e03a479 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": 
"megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "o_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d2da327ce547ad69473e207bc15446148215515 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df21dfb3c95f11a93998d48bb7b3b041d0a084b43474b7f15a9d4ea458e5e604 +size 828526568 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/additional_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..0da9dd9821596948ce1ce2526dc00d1f67225da3 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_random20.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, 
+ "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + 
"run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": 
false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, 
hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, 
eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, 
use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/optimizer.pt b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..36f3888dd490d8c9eef16eeec35106603dad89f1 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d643ff296892bc5414ad99ca0d39f3c497c92d57117eec4d6f89c97d41d3faf +size 1657698290 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/rng_state.pth b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7bfe0fd93c58e3405a13c5032495e503a5e3f47d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c940ef346647e64b4c2275338df437336d6d5adb003d4210dc70e9e174f44db +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f8882c16fb0abc091aaea5286781182c084d87d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b3a16451354ac84ec594942621c3011b01d575ac8a6b2fa4481b0291c904a7 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/trainer_state.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf7ce194c38d3a21f3d35a8bc430b1ba7db339d1 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/trainer_state.json @@ -0,0 +1,2473 @@ +{ + "best_metric": 0.29718354, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 990, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.5129857659339905, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.569739043712616, + "memory(GiB)": 144.03, + "step": 1, + "token_acc": 0.8513761467889909, + "train_speed(iter/s)": 0.135022 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.8691070675849915, + "learning_rate": 1e-05, + "loss": 0.7468794584274292, + "memory(GiB)": 153.42, + "step": 5, + "token_acc": 0.8297613248904043, + "train_speed(iter/s)": 0.212168 + }, + { + "epoch": 0.050505050505050504, + 
"grad_norm": 0.5965867638587952, + "learning_rate": 2e-05, + "loss": 0.7946175098419189, + "memory(GiB)": 160.41, + "step": 10, + "token_acc": 0.787320071162866, + "train_speed(iter/s)": 0.221849 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.548611044883728, + "learning_rate": 3e-05, + "loss": 0.7008682727813721, + "memory(GiB)": 170.14, + "step": 15, + "token_acc": 0.8016944665078104, + "train_speed(iter/s)": 0.218317 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 4.018751621246338, + "learning_rate": 4e-05, + "loss": 0.5246500968933105, + "memory(GiB)": 170.14, + "step": 20, + "token_acc": 0.8706467661691543, + "train_speed(iter/s)": 0.236511 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 0.7290887236595154, + "eval_runtime": 1.8127, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 2.207, + "eval_token_acc": 0.7212787212787213, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.45920249819755554, + "learning_rate": 5e-05, + "loss": 0.48354249000549315, + "memory(GiB)": 179.18, + "step": 25, + "token_acc": 0.8261477045908183, + "train_speed(iter/s)": 0.214846 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.1457232236862183, + "learning_rate": 6e-05, + "loss": 0.5703897476196289, + "memory(GiB)": 179.18, + "step": 30, + "token_acc": 0.8114154296466652, + "train_speed(iter/s)": 0.223062 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.3219285309314728, + "learning_rate": 7e-05, + "loss": 0.3709995269775391, + "memory(GiB)": 179.18, + "step": 35, + "token_acc": 0.8511754068716094, + "train_speed(iter/s)": 0.226934 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.2930394113063812, + "learning_rate": 8e-05, + "loss": 0.44092235565185545, + "memory(GiB)": 179.18, + "step": 40, + "token_acc": 0.8480160435467698, + "train_speed(iter/s)": 0.226914 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5398522615432739, + "eval_runtime": 1.8136, + "eval_samples_per_second": 2.206, 
+ "eval_steps_per_second": 2.206, + "eval_token_acc": 0.7362637362637363, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.4423171281814575, + "learning_rate": 9e-05, + "loss": 0.4420435428619385, + "memory(GiB)": 179.18, + "step": 45, + "token_acc": 0.83846547314578, + "train_speed(iter/s)": 0.218055 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.356607049703598, + "learning_rate": 0.0001, + "loss": 0.5247397899627686, + "memory(GiB)": 179.18, + "step": 50, + "token_acc": 0.8684412312410998, + "train_speed(iter/s)": 0.213301 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.41687265038490295, + "learning_rate": 9.999301905929286e-05, + "loss": 0.46730861663818357, + "memory(GiB)": 179.18, + "step": 55, + "token_acc": 0.8472682119205298, + "train_speed(iter/s)": 0.21328 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.7148261666297913, + "learning_rate": 9.997207818651274e-05, + "loss": 0.36838181018829347, + "memory(GiB)": 179.18, + "step": 60, + "token_acc": 0.8551724137931035, + "train_speed(iter/s)": 0.218719 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.5127567052841187, + "eval_runtime": 1.8179, + "eval_samples_per_second": 2.2, + "eval_steps_per_second": 2.2, + "eval_token_acc": 0.7422577422577422, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.6570205688476562, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5411728858947754, + "memory(GiB)": 179.18, + "step": 65, + "token_acc": 0.8298285714285715, + "train_speed(iter/s)": 0.213853 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.2901982367038727, + "learning_rate": 9.988834393115767e-05, + "loss": 0.39851596355438235, + "memory(GiB)": 179.18, + "step": 70, + "token_acc": 0.8732449297971919, + "train_speed(iter/s)": 0.211799 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.31978854537010193, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5697728157043457, + "memory(GiB)": 179.18, + "step": 75, + 
"token_acc": 0.8354898336414048, + "train_speed(iter/s)": 0.215997 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.3901304304599762, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6011258602142334, + "memory(GiB)": 179.25, + "step": 80, + "token_acc": 0.8381935097951249, + "train_speed(iter/s)": 0.216474 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5226491093635559, + "eval_runtime": 1.8268, + "eval_samples_per_second": 2.19, + "eval_steps_per_second": 2.19, + "eval_token_acc": 0.7392607392607392, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.27575281262397766, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5449016571044922, + "memory(GiB)": 179.25, + "step": 85, + "token_acc": 0.8128, + "train_speed(iter/s)": 0.213104 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.6381244659423828, + "learning_rate": 9.9553874407739e-05, + "loss": 0.4397461414337158, + "memory(GiB)": 179.25, + "step": 90, + "token_acc": 0.8472657610588645, + "train_speed(iter/s)": 0.214392 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.3371107280254364, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3532680034637451, + "memory(GiB)": 179.25, + "step": 95, + "token_acc": 0.8662573411639082, + "train_speed(iter/s)": 0.21374 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.3131839334964752, + "learning_rate": 9.930351269950143e-05, + "loss": 0.41753764152526857, + "memory(GiB)": 194.67, + "step": 100, + "token_acc": 0.8640469738030714, + "train_speed(iter/s)": 0.211078 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5102224349975586, + "eval_runtime": 1.8327, + "eval_samples_per_second": 2.183, + "eval_steps_per_second": 2.183, + "eval_token_acc": 0.7382617382617382, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.39832383394241333, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5459073543548584, + "memory(GiB)": 194.67, + "step": 105, + "token_acc": 
0.8039112050739958, + "train_speed(iter/s)": 0.210332 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.300778865814209, + "learning_rate": 9.899808525182935e-05, + "loss": 0.5121739864349365, + "memory(GiB)": 194.67, + "step": 110, + "token_acc": 0.8097281831187411, + "train_speed(iter/s)": 0.214826 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.34517738223075867, + "learning_rate": 9.882482608435923e-05, + "loss": 0.451249361038208, + "memory(GiB)": 194.67, + "step": 115, + "token_acc": 0.8614628614628614, + "train_speed(iter/s)": 0.211594 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7278887033462524, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5085325717926026, + "memory(GiB)": 194.67, + "step": 120, + "token_acc": 0.8130574826560951, + "train_speed(iter/s)": 0.21067 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.509061336517334, + "eval_runtime": 1.7896, + "eval_samples_per_second": 2.235, + "eval_steps_per_second": 2.235, + "eval_token_acc": 0.7382617382617382, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.38017159700393677, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4689138889312744, + "memory(GiB)": 194.67, + "step": 125, + "token_acc": 0.8352281825460368, + "train_speed(iter/s)": 0.210631 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.40452879667282104, + "learning_rate": 9.822345875271883e-05, + "loss": 0.4758878707885742, + "memory(GiB)": 194.67, + "step": 130, + "token_acc": 0.8449714013346044, + "train_speed(iter/s)": 0.209823 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.41681820154190063, + "learning_rate": 9.799599295015154e-05, + "loss": 0.3720943212509155, + "memory(GiB)": 194.67, + "step": 135, + "token_acc": 0.8728339854667412, + "train_speed(iter/s)": 0.209914 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.43215978145599365, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5305635452270507, + "memory(GiB)": 194.67, + "step": 
140, + "token_acc": 0.8278240499739719, + "train_speed(iter/s)": 0.20961 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.4448011815547943, + "eval_runtime": 1.8087, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7502497502497503, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.5140364766120911, + "learning_rate": 9.750092174273521e-05, + "loss": 0.34435036182403567, + "memory(GiB)": 194.67, + "step": 145, + "token_acc": 0.8617477760334903, + "train_speed(iter/s)": 0.207394 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.8977436423301697, + "learning_rate": 9.723345458039594e-05, + "loss": 0.4015669345855713, + "memory(GiB)": 194.67, + "step": 150, + "token_acc": 0.8720949673967564, + "train_speed(iter/s)": 0.20834 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.6019405126571655, + "learning_rate": 9.69527980602239e-05, + "loss": 0.4199058055877686, + "memory(GiB)": 194.67, + "step": 155, + "token_acc": 0.8565537923278771, + "train_speed(iter/s)": 0.20801 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.41383394598960876, + "learning_rate": 9.665903055208014e-05, + "loss": 0.34770309925079346, + "memory(GiB)": 194.67, + "step": 160, + "token_acc": 0.8795408083031924, + "train_speed(iter/s)": 0.208783 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.4233054220676422, + "eval_runtime": 1.8491, + "eval_samples_per_second": 2.163, + "eval_steps_per_second": 2.163, + "eval_token_acc": 0.7442557442557443, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.2816776931285858, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4095714569091797, + "memory(GiB)": 194.67, + "step": 165, + "token_acc": 0.8456421395601412, + "train_speed(iter/s)": 0.20789 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.41569507122039795, + "learning_rate": 9.603249433382144e-05, + "loss": 0.45749435424804685, + "memory(GiB)": 194.67, + "step": 170, + 
"token_acc": 0.8532716457369465, + "train_speed(iter/s)": 0.20746 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.4932589828968048, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4164144515991211, + "memory(GiB)": 194.67, + "step": 175, + "token_acc": 0.8524394404640054, + "train_speed(iter/s)": 0.208373 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3632556200027466, + "learning_rate": 9.535454568671704e-05, + "loss": 0.422211742401123, + "memory(GiB)": 194.67, + "step": 180, + "token_acc": 0.8620525059665871, + "train_speed(iter/s)": 0.208791 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.45568227767944336, + "eval_runtime": 1.8409, + "eval_samples_per_second": 2.173, + "eval_steps_per_second": 2.173, + "eval_token_acc": 0.7582417582417582, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.4775158166885376, + "learning_rate": 9.49965261014704e-05, + "loss": 0.49337053298950195, + "memory(GiB)": 194.67, + "step": 185, + "token_acc": 0.8209519012843113, + "train_speed(iter/s)": 0.207976 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 2.7428700923919678, + "learning_rate": 9.462594179299406e-05, + "loss": 0.7271251678466797, + "memory(GiB)": 194.67, + "step": 190, + "token_acc": 0.7965624119470274, + "train_speed(iter/s)": 0.209678 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.46910813450813293, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5494725227355957, + "memory(GiB)": 194.67, + "step": 195, + "token_acc": 0.8197339246119734, + "train_speed(iter/s)": 0.208894 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.2979983389377594, + "learning_rate": 9.384749641033359e-05, + "loss": 0.4731719970703125, + "memory(GiB)": 194.67, + "step": 200, + "token_acc": 0.8567099230709457, + "train_speed(iter/s)": 0.20626 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.499001145362854, + "eval_runtime": 1.8407, + "eval_samples_per_second": 2.173, + "eval_steps_per_second": 2.173, + 
"eval_token_acc": 0.7502497502497503, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.38004475831985474, + "learning_rate": 9.343985270739182e-05, + "loss": 0.42464404106140136, + "memory(GiB)": 194.67, + "step": 205, + "token_acc": 0.8373205741626795, + "train_speed(iter/s)": 0.203763 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.4347917437553406, + "learning_rate": 9.302007896300698e-05, + "loss": 0.3545402765274048, + "memory(GiB)": 194.67, + "step": 210, + "token_acc": 0.8774455518641565, + "train_speed(iter/s)": 0.203468 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.38388073444366455, + "learning_rate": 9.25882923938038e-05, + "loss": 0.32962794303894044, + "memory(GiB)": 194.67, + "step": 215, + "token_acc": 0.8891755236817666, + "train_speed(iter/s)": 0.202581 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5489069223403931, + "learning_rate": 9.214461357083985e-05, + "loss": 0.2972090482711792, + "memory(GiB)": 194.67, + "step": 220, + "token_acc": 0.9002027809965237, + "train_speed(iter/s)": 0.202983 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.5020791292190552, + "eval_runtime": 1.8362, + "eval_samples_per_second": 2.178, + "eval_steps_per_second": 2.178, + "eval_token_acc": 0.7532467532467533, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.602890133857727, + "learning_rate": 9.168916638593736e-05, + "loss": 0.42758522033691404, + "memory(GiB)": 194.67, + "step": 225, + "token_acc": 0.8501907293954456, + "train_speed(iter/s)": 0.202184 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.39421549439430237, + "learning_rate": 9.122207801708802e-05, + "loss": 0.36005940437316897, + "memory(GiB)": 194.67, + "step": 230, + "token_acc": 0.8754238800642513, + "train_speed(iter/s)": 0.200825 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.46965521574020386, + "learning_rate": 9.074347889294016e-05, + "loss": 0.15720741748809813, + "memory(GiB)": 194.67, + 
"step": 235, + "token_acc": 0.9282419272168119, + "train_speed(iter/s)": 0.202308 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.7477562427520752, + "learning_rate": 9.025350265637815e-05, + "loss": 0.3855461120605469, + "memory(GiB)": 194.67, + "step": 240, + "token_acc": 0.8711264141662568, + "train_speed(iter/s)": 0.202911 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5334002375602722, + "eval_runtime": 1.8268, + "eval_samples_per_second": 2.19, + "eval_steps_per_second": 2.19, + "eval_token_acc": 0.7442557442557443, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.5034751892089844, + "learning_rate": 8.975228612720416e-05, + "loss": 0.24191198348999024, + "memory(GiB)": 194.67, + "step": 245, + "token_acc": 0.8654323028599769, + "train_speed(iter/s)": 0.203143 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.7961943745613098, + "learning_rate": 8.923996926393305e-05, + "loss": 0.3798638105392456, + "memory(GiB)": 194.67, + "step": 250, + "token_acc": 0.8645948945615982, + "train_speed(iter/s)": 0.204558 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.4535170793533325, + "learning_rate": 8.871669512471068e-05, + "loss": 0.3437025547027588, + "memory(GiB)": 194.67, + "step": 255, + "token_acc": 0.87003341997772, + "train_speed(iter/s)": 0.203403 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.7094895243644714, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3248720645904541, + "memory(GiB)": 194.67, + "step": 260, + "token_acc": 0.8757570513929746, + "train_speed(iter/s)": 0.204064 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4627455472946167, + "eval_runtime": 1.8037, + "eval_samples_per_second": 2.218, + "eval_steps_per_second": 2.218, + "eval_token_acc": 0.7502497502497503, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.22084304690361023, + "learning_rate": 8.763786250861256e-05, + "loss": 0.2554438352584839, + "memory(GiB)": 194.67, + "step": 265, + 
"token_acc": 0.9007541995200549, + "train_speed(iter/s)": 0.201941 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.44375061988830566, + "learning_rate": 8.708260528239788e-05, + "loss": 0.23369157314300537, + "memory(GiB)": 194.67, + "step": 270, + "token_acc": 0.910726525017135, + "train_speed(iter/s)": 0.202564 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.42193475365638733, + "learning_rate": 8.651699319743347e-05, + "loss": 0.2531747817993164, + "memory(GiB)": 194.67, + "step": 275, + "token_acc": 0.9063345966432051, + "train_speed(iter/s)": 0.20219 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.8283807635307312, + "learning_rate": 8.594118419389647e-05, + "loss": 0.37001192569732666, + "memory(GiB)": 194.67, + "step": 280, + "token_acc": 0.8865761157170576, + "train_speed(iter/s)": 0.20285 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.427340030670166, + "eval_runtime": 1.8284, + "eval_samples_per_second": 2.188, + "eval_steps_per_second": 2.188, + "eval_token_acc": 0.7552447552447552, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.7095621824264526, + "learning_rate": 8.535533905932738e-05, + "loss": 0.18019050359725952, + "memory(GiB)": 194.67, + "step": 285, + "token_acc": 0.9067105947633085, + "train_speed(iter/s)": 0.203001 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.7720094323158264, + "learning_rate": 8.475962138373213e-05, + "loss": 0.33577933311462405, + "memory(GiB)": 194.67, + "step": 290, + "token_acc": 0.8656873032528857, + "train_speed(iter/s)": 0.203902 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 1.2524478435516357, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3461789131164551, + "memory(GiB)": 194.67, + "step": 295, + "token_acc": 0.8515314472761282, + "train_speed(iter/s)": 0.204926 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 2.095057725906372, + "learning_rate": 8.353923650696118e-05, + "loss": 0.28853349685668944, + "memory(GiB)": 
194.67, + "step": 300, + "token_acc": 0.8904656319290466, + "train_speed(iter/s)": 0.205093 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.43511924147605896, + "eval_runtime": 1.8124, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 2.207, + "eval_token_acc": 0.7432567432567433, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.516942024230957, + "learning_rate": 8.291491008316409e-05, + "loss": 0.3344187498092651, + "memory(GiB)": 194.67, + "step": 305, + "token_acc": 0.8720779866706456, + "train_speed(iter/s)": 0.20388 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.5044359564781189, + "learning_rate": 8.228139257794012e-05, + "loss": 0.2534762382507324, + "memory(GiB)": 194.68, + "step": 310, + "token_acc": 0.9083986562150056, + "train_speed(iter/s)": 0.20477 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 1.4840720891952515, + "learning_rate": 8.163886089321493e-05, + "loss": 0.23027021884918214, + "memory(GiB)": 194.68, + "step": 315, + "token_acc": 0.9053683385579937, + "train_speed(iter/s)": 0.205493 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.47270119190216064, + "learning_rate": 8.098749444801224e-05, + "loss": 0.2922214031219482, + "memory(GiB)": 194.68, + "step": 320, + "token_acc": 0.9086255041886441, + "train_speed(iter/s)": 0.205726 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.375497043132782, + "eval_runtime": 1.7963, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 2.227, + "eval_token_acc": 0.7572427572427572, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.5327674746513367, + "learning_rate": 8.032747512835337e-05, + "loss": 0.2936864376068115, + "memory(GiB)": 194.68, + "step": 325, + "token_acc": 0.8697649283977303, + "train_speed(iter/s)": 0.205415 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7329817414283752, + "learning_rate": 7.965898723646776e-05, + "loss": 0.3568688154220581, + "memory(GiB)": 194.68, + 
"step": 330, + "token_acc": 0.8953463435556509, + "train_speed(iter/s)": 0.20625 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.46463268995285034, + "learning_rate": 7.898221743932888e-05, + "loss": 0.28799734115600584, + "memory(GiB)": 194.68, + "step": 335, + "token_acc": 0.8915866741953699, + "train_speed(iter/s)": 0.206245 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.9827662110328674, + "learning_rate": 7.829735471652978e-05, + "loss": 0.19362801313400269, + "memory(GiB)": 194.68, + "step": 340, + "token_acc": 0.9217057761732852, + "train_speed(iter/s)": 0.207024 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.36126405000686646, + "eval_runtime": 1.7972, + "eval_samples_per_second": 2.226, + "eval_steps_per_second": 2.226, + "eval_token_acc": 0.7492507492507493, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.5373030304908752, + "learning_rate": 7.760459030751284e-05, + "loss": 0.24427511692047119, + "memory(GiB)": 194.68, + "step": 345, + "token_acc": 0.9034146341463415, + "train_speed(iter/s)": 0.20636 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.4386836290359497, + "learning_rate": 7.690411765816864e-05, + "loss": 0.1736771821975708, + "memory(GiB)": 194.68, + "step": 350, + "token_acc": 0.9324116743471582, + "train_speed(iter/s)": 0.207166 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.9116824269294739, + "learning_rate": 7.619613236681843e-05, + "loss": 0.3541959285736084, + "memory(GiB)": 194.68, + "step": 355, + "token_acc": 0.8842233999184672, + "train_speed(iter/s)": 0.207241 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.32813596725463867, + "learning_rate": 7.548083212959588e-05, + "loss": 0.2200457811355591, + "memory(GiB)": 194.68, + "step": 360, + "token_acc": 0.9080932784636488, + "train_speed(iter/s)": 0.207214 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.35305824875831604, + "eval_runtime": 1.8366, + "eval_samples_per_second": 2.178, + 
"eval_steps_per_second": 2.178, + "eval_token_acc": 0.7532467532467533, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.9034874439239502, + "learning_rate": 7.475841668524268e-05, + "loss": 0.28913445472717286, + "memory(GiB)": 194.68, + "step": 365, + "token_acc": 0.870794734275963, + "train_speed(iter/s)": 0.207182 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.37799835205078125, + "learning_rate": 7.402908775933419e-05, + "loss": 0.29883947372436526, + "memory(GiB)": 194.68, + "step": 370, + "token_acc": 0.8907584448693435, + "train_speed(iter/s)": 0.207123 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.5515828132629395, + "learning_rate": 7.329304900794991e-05, + "loss": 0.34903314113616946, + "memory(GiB)": 194.68, + "step": 375, + "token_acc": 0.8823183635081119, + "train_speed(iter/s)": 0.206817 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.7102660536766052, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3167442321777344, + "memory(GiB)": 194.68, + "step": 380, + "token_acc": 0.8950437317784257, + "train_speed(iter/s)": 0.206935 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.31274980306625366, + "eval_runtime": 1.8255, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 2.191, + "eval_token_acc": 0.7632367632367633, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.24322476983070374, + "learning_rate": 7.180166596385914e-05, + "loss": 0.33083691596984866, + "memory(GiB)": 194.68, + "step": 385, + "token_acc": 0.8845164609053497, + "train_speed(iter/s)": 0.206393 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.3991744816303253, + "learning_rate": 7.104673812141675e-05, + "loss": 0.23392996788024903, + "memory(GiB)": 194.68, + "step": 390, + "token_acc": 0.9032924310533349, + "train_speed(iter/s)": 0.206252 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.38407906889915466, + "learning_rate": 7.02859332377382e-05, + "loss": 
0.19444363117218016, + "memory(GiB)": 194.68, + "step": 395, + "token_acc": 0.9170926872638364, + "train_speed(iter/s)": 0.206907 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.5186311602592468, + "learning_rate": 6.951946375817474e-05, + "loss": 0.17200204133987426, + "memory(GiB)": 194.68, + "step": 400, + "token_acc": 0.9564939219449776, + "train_speed(iter/s)": 0.207883 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.3031849265098572, + "eval_runtime": 1.7906, + "eval_samples_per_second": 2.234, + "eval_steps_per_second": 2.234, + "eval_token_acc": 0.7542457542457542, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.6981443166732788, + "learning_rate": 6.874754370984606e-05, + "loss": 0.12331185340881348, + "memory(GiB)": 194.68, + "step": 405, + "token_acc": 0.9200656994251301, + "train_speed(iter/s)": 0.207599 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.9956308603286743, + "learning_rate": 6.797038864187564e-05, + "loss": 0.12755507230758667, + "memory(GiB)": 194.68, + "step": 410, + "token_acc": 0.9337340775726349, + "train_speed(iter/s)": 0.207712 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.5786072611808777, + "learning_rate": 6.718821556520151e-05, + "loss": 0.12629324197769165, + "memory(GiB)": 194.68, + "step": 415, + "token_acc": 0.9433671220802116, + "train_speed(iter/s)": 0.208254 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.40014225244522095, + "learning_rate": 6.640124289197845e-05, + "loss": 0.08971643447875977, + "memory(GiB)": 194.68, + "step": 420, + "token_acc": 0.9842690534309737, + "train_speed(iter/s)": 0.208984 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.2971835434436798, + "eval_runtime": 1.8393, + "eval_samples_per_second": 2.175, + "eval_steps_per_second": 2.175, + "eval_token_acc": 0.7622377622377622, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.645484209060669, + "learning_rate": 6.560969037458933e-05, + "loss": 
0.1467829942703247, + "memory(GiB)": 194.68, + "step": 425, + "token_acc": 0.9118694362017804, + "train_speed(iter/s)": 0.208889 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.3015040159225464, + "learning_rate": 6.481377904428171e-05, + "loss": 0.08001596331596375, + "memory(GiB)": 194.68, + "step": 430, + "token_acc": 0.9746255724718406, + "train_speed(iter/s)": 0.208643 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.3580942451953888, + "learning_rate": 6.401373114944781e-05, + "loss": 0.12085707187652588, + "memory(GiB)": 194.68, + "step": 435, + "token_acc": 0.9670248240088922, + "train_speed(iter/s)": 0.20794 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8356326222419739, + "learning_rate": 6.320977009356431e-05, + "loss": 0.12868813276290894, + "memory(GiB)": 194.68, + "step": 440, + "token_acc": 0.9537296229211263, + "train_speed(iter/s)": 0.208265 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.3201369047164917, + "eval_runtime": 1.8482, + "eval_samples_per_second": 2.164, + "eval_steps_per_second": 2.164, + "eval_token_acc": 0.7542457542457542, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.3740726113319397, + "learning_rate": 6.240212037280966e-05, + "loss": 0.059886491298675536, + "memory(GiB)": 194.68, + "step": 445, + "token_acc": 0.9480653040236534, + "train_speed(iter/s)": 0.207886 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.8616650104522705, + "learning_rate": 6.159100751337642e-05, + "loss": 0.19066305160522462, + "memory(GiB)": 194.68, + "step": 450, + "token_acc": 0.9332333083270817, + "train_speed(iter/s)": 0.208264 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.34099313616752625, + "learning_rate": 6.077665800849568e-05, + "loss": 0.09232727885246277, + "memory(GiB)": 194.68, + "step": 455, + "token_acc": 0.9671393509680938, + "train_speed(iter/s)": 0.208355 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.35433682799339294, + "learning_rate": 
5.99592992551918e-05, + "loss": 0.11596425771713256, + "memory(GiB)": 194.68, + "step": 460, + "token_acc": 0.9609312709296763, + "train_speed(iter/s)": 0.208578 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3214629888534546, + "eval_runtime": 1.8025, + "eval_samples_per_second": 2.219, + "eval_steps_per_second": 2.219, + "eval_token_acc": 0.7582417582417582, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.5267307758331299, + "learning_rate": 5.913915949078452e-05, + "loss": 0.11863926649093628, + "memory(GiB)": 194.68, + "step": 465, + "token_acc": 0.9122145401215168, + "train_speed(iter/s)": 0.208792 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.7042592167854309, + "learning_rate": 5.831646772915651e-05, + "loss": 0.07829801440238952, + "memory(GiB)": 194.68, + "step": 470, + "token_acc": 0.9680500284252416, + "train_speed(iter/s)": 0.208441 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 0.8526184558868408, + "learning_rate": 5.749145369680407e-05, + "loss": 0.0904280424118042, + "memory(GiB)": 194.68, + "step": 475, + "token_acc": 0.9607805987116332, + "train_speed(iter/s)": 0.208835 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.3466002345085144, + "learning_rate": 5.666434776868895e-05, + "loss": 0.17373031377792358, + "memory(GiB)": 194.68, + "step": 480, + "token_acc": 0.942286629033617, + "train_speed(iter/s)": 0.207896 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.3141739070415497, + "eval_runtime": 1.7942, + "eval_samples_per_second": 2.229, + "eval_steps_per_second": 2.229, + "eval_token_acc": 0.7582417582417582, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.9034782648086548, + "learning_rate": 5.583538090390882e-05, + "loss": 0.13746129274368285, + "memory(GiB)": 194.68, + "step": 485, + "token_acc": 0.9038551951104843, + "train_speed(iter/s)": 0.208273 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5720848441123962, + "learning_rate": 
5.5004784581204927e-05, + "loss": 0.1857938289642334, + "memory(GiB)": 194.68, + "step": 490, + "token_acc": 0.9280701754385965, + "train_speed(iter/s)": 0.208224 + }, + { + "epoch": 2.5, + "grad_norm": 0.35935673117637634, + "learning_rate": 5.41727907343245e-05, + "loss": 0.11083965301513672, + "memory(GiB)": 194.68, + "step": 495, + "token_acc": 0.9560761346998536, + "train_speed(iter/s)": 0.208702 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.547448992729187, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.1741043210029602, + "memory(GiB)": 194.68, + "step": 500, + "token_acc": 0.9156242021955578, + "train_speed(iter/s)": 0.208566 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.3121967911720276, + "eval_runtime": 1.8195, + "eval_samples_per_second": 2.198, + "eval_steps_per_second": 2.198, + "eval_token_acc": 0.7562437562437563, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.5953697562217712, + "learning_rate": 5.250554008935596e-05, + "loss": 0.09475700855255127, + "memory(GiB)": 194.68, + "step": 505, + "token_acc": 0.9292557111274871, + "train_speed(iter/s)": 0.208436 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.6027724146842957, + "learning_rate": 5.167074885038373e-05, + "loss": 0.10801750421524048, + "memory(GiB)": 194.68, + "step": 510, + "token_acc": 0.9673539518900344, + "train_speed(iter/s)": 0.208985 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.3738398849964142, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.1285596013069153, + "memory(GiB)": 194.68, + "step": 515, + "token_acc": 0.9517408906882591, + "train_speed(iter/s)": 0.209255 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.43995335698127747, + "learning_rate": 5e-05, + "loss": 0.13147177696228027, + "memory(GiB)": 194.68, + "step": 520, + "token_acc": 0.9425985953538628, + "train_speed(iter/s)": 0.209221 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.3112495243549347, + "eval_runtime": 1.8299, + 
"eval_samples_per_second": 2.186, + "eval_steps_per_second": 2.186, + "eval_token_acc": 0.7552447552447552, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.45797106623649597, + "learning_rate": 4.916450892453495e-05, + "loss": 0.08995423913002014, + "memory(GiB)": 194.68, + "step": 525, + "token_acc": 0.9309090909090909, + "train_speed(iter/s)": 0.209276 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3859306871891022, + "learning_rate": 4.832925114961629e-05, + "loss": 0.15920201539993287, + "memory(GiB)": 194.68, + "step": 530, + "token_acc": 0.9442068067695741, + "train_speed(iter/s)": 0.208661 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.9083346724510193, + "learning_rate": 4.749445991064404e-05, + "loss": 0.10588231086730956, + "memory(GiB)": 194.68, + "step": 535, + "token_acc": 0.965555432495293, + "train_speed(iter/s)": 0.208205 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.9049814343452454, + "learning_rate": 4.666036831274392e-05, + "loss": 0.21194179058074952, + "memory(GiB)": 194.68, + "step": 540, + "token_acc": 0.9197012138188608, + "train_speed(iter/s)": 0.207772 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.3101465106010437, + "eval_runtime": 1.805, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 2.216, + "eval_token_acc": 0.7552447552447552, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.5595670342445374, + "learning_rate": 4.582720926567552e-05, + "loss": 0.15268012285232543, + "memory(GiB)": 194.68, + "step": 545, + "token_acc": 0.9091122592766557, + "train_speed(iter/s)": 0.207444 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.2707825005054474, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.10676953792572022, + "memory(GiB)": 194.68, + "step": 550, + "token_acc": 0.9608655616942909, + "train_speed(iter/s)": 0.207501 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.5363113880157471, + "learning_rate": 
4.416461909609119e-05, + "loss": 0.0965248167514801, + "memory(GiB)": 194.68, + "step": 555, + "token_acc": 0.9671724992257665, + "train_speed(iter/s)": 0.207753 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.32662245631217957, + "learning_rate": 4.333565223131107e-05, + "loss": 0.1195767879486084, + "memory(GiB)": 194.68, + "step": 560, + "token_acc": 0.9444240869671431, + "train_speed(iter/s)": 0.20762 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.30127066373825073, + "eval_runtime": 1.8237, + "eval_samples_per_second": 2.193, + "eval_steps_per_second": 2.193, + "eval_token_acc": 0.7572427572427572, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.6958869695663452, + "learning_rate": 4.250854630319593e-05, + "loss": 0.14267081022262573, + "memory(GiB)": 194.68, + "step": 565, + "token_acc": 0.9234961000106849, + "train_speed(iter/s)": 0.207194 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.6756062507629395, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.19810810089111328, + "memory(GiB)": 194.68, + "step": 570, + "token_acc": 0.9246951219512195, + "train_speed(iter/s)": 0.206849 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.8443397283554077, + "learning_rate": 4.0860840509215496e-05, + "loss": 0.1045069694519043, + "memory(GiB)": 194.68, + "step": 575, + "token_acc": 0.9572841133816744, + "train_speed(iter/s)": 0.206796 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 3.983555793762207, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.15350983142852784, + "memory(GiB)": 194.69, + "step": 580, + "token_acc": 0.9409324377716317, + "train_speed(iter/s)": 0.207187 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.3041973114013672, + "eval_runtime": 1.8171, + "eval_samples_per_second": 2.201, + "eval_steps_per_second": 2.201, + "eval_token_acc": 0.7532467532467533, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.7293962240219116, + "learning_rate": 
3.922334199150432e-05, + "loss": 0.14068719148635864, + "memory(GiB)": 194.69, + "step": 585, + "token_acc": 0.9072411729503291, + "train_speed(iter/s)": 0.207381 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.6762995719909668, + "learning_rate": 3.840899248662358e-05, + "loss": 0.1358010172843933, + "memory(GiB)": 194.69, + "step": 590, + "token_acc": 0.9516318887105404, + "train_speed(iter/s)": 0.206957 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.22191222012043, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.12651437520980835, + "memory(GiB)": 194.69, + "step": 595, + "token_acc": 0.9631662269129287, + "train_speed(iter/s)": 0.206604 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.5518550276756287, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.05826301574707031, + "memory(GiB)": 194.69, + "step": 600, + "token_acc": 0.9831535392345204, + "train_speed(iter/s)": 0.206751 + }, + { + "epoch": 3.0303030303030303, + "eval_loss": 0.3032275140285492, + "eval_runtime": 1.788, + "eval_samples_per_second": 2.237, + "eval_steps_per_second": 2.237, + "eval_token_acc": 0.7522477522477522, + "step": 600 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.6190295219421387, + "learning_rate": 3.598626885055219e-05, + "loss": 0.0395317792892456, + "memory(GiB)": 194.69, + "step": 605, + "token_acc": 0.9590757783434501, + "train_speed(iter/s)": 0.206057 + }, + { + "epoch": 3.080808080808081, + "grad_norm": 0.25448670983314514, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.021629127860069274, + "memory(GiB)": 194.69, + "step": 610, + "token_acc": 0.9906427990235964, + "train_speed(iter/s)": 0.206434 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.06814919412136078, + "learning_rate": 3.4390309625410686e-05, + "loss": 0.012175245583057404, + "memory(GiB)": 194.69, + "step": 615, + "token_acc": 0.9950428120775124, + "train_speed(iter/s)": 0.206853 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 
0.41807010769844055, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.03373536169528961, + "memory(GiB)": 194.69, + "step": 620, + "token_acc": 0.9886826618379357, + "train_speed(iter/s)": 0.206723 + }, + { + "epoch": 3.1313131313131315, + "eval_loss": 0.31177499890327454, + "eval_runtime": 1.8259, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 2.191, + "eval_token_acc": 0.7552447552447552, + "step": 620 + }, + { + "epoch": 3.1565656565656566, + "grad_norm": 0.4315682351589203, + "learning_rate": 3.281178443479852e-05, + "loss": 0.03276803493499756, + "memory(GiB)": 194.69, + "step": 625, + "token_acc": 0.9653735489631908, + "train_speed(iter/s)": 0.206122 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.3409155309200287, + "learning_rate": 3.202961135812437e-05, + "loss": 0.03847094178199768, + "memory(GiB)": 194.69, + "step": 630, + "token_acc": 0.9868200836820084, + "train_speed(iter/s)": 0.206497 + }, + { + "epoch": 3.207070707070707, + "grad_norm": 0.822550356388092, + "learning_rate": 3.1252456290153954e-05, + "loss": 0.07642163634300232, + "memory(GiB)": 194.69, + "step": 635, + "token_acc": 0.9701730418943534, + "train_speed(iter/s)": 0.206299 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.39569079875946045, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.06662976741790771, + "memory(GiB)": 194.69, + "step": 640, + "token_acc": 0.9696140693698094, + "train_speed(iter/s)": 0.205891 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.3040866553783417, + "eval_runtime": 1.796, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 2.227, + "eval_token_acc": 0.7552447552447552, + "step": 640 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.5110739469528198, + "learning_rate": 2.9714066762261823e-05, + "loss": 0.055225080251693724, + "memory(GiB)": 194.69, + "step": 645, + "token_acc": 0.9549541529422239, + "train_speed(iter/s)": 0.20523 + }, + { + "epoch": 3.282828282828283, + "grad_norm": 
0.30346569418907166, + "learning_rate": 2.895326187858326e-05, + "loss": 0.05822249054908753, + "memory(GiB)": 194.69, + "step": 650, + "token_acc": 0.9828406388039416, + "train_speed(iter/s)": 0.205468 + }, + { + "epoch": 3.308080808080808, + "grad_norm": 0.39017254114151, + "learning_rate": 2.8198334036140874e-05, + "loss": 0.026796561479568482, + "memory(GiB)": 194.69, + "step": 655, + "token_acc": 0.992970946579194, + "train_speed(iter/s)": 0.205582 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.2839377522468567, + "learning_rate": 2.74494940391949e-05, + "loss": 0.045568430423736574, + "memory(GiB)": 194.69, + "step": 660, + "token_acc": 0.9833729216152018, + "train_speed(iter/s)": 0.2055 + }, + { + "epoch": 3.3333333333333335, + "eval_loss": 0.3058268129825592, + "eval_runtime": 1.7988, + "eval_samples_per_second": 2.224, + "eval_steps_per_second": 2.224, + "eval_token_acc": 0.7562437562437563, + "step": 660 + }, + { + "epoch": 3.3585858585858586, + "grad_norm": 0.7259745597839355, + "learning_rate": 2.6706950992050094e-05, + "loss": 0.04427232146263123, + "memory(GiB)": 194.69, + "step": 665, + "token_acc": 0.9405374499714122, + "train_speed(iter/s)": 0.205656 + }, + { + "epoch": 3.3838383838383836, + "grad_norm": 0.6911581754684448, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.044158649444580075, + "memory(GiB)": 194.69, + "step": 670, + "token_acc": 0.9826169405815424, + "train_speed(iter/s)": 0.205791 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.36632707715034485, + "learning_rate": 2.5241583314757327e-05, + "loss": 0.038330867886543274, + "memory(GiB)": 194.69, + "step": 675, + "token_acc": 0.9843804843804844, + "train_speed(iter/s)": 0.205991 + }, + { + "epoch": 3.4343434343434343, + "grad_norm": 0.4337018132209778, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.05873996019363403, + "memory(GiB)": 194.69, + "step": 680, + "token_acc": 0.974401170232218, + "train_speed(iter/s)": 0.206296 + }, + { + "epoch": 
3.4343434343434343, + "eval_loss": 0.3051184415817261, + "eval_runtime": 1.8159, + "eval_samples_per_second": 2.203, + "eval_steps_per_second": 2.203, + "eval_token_acc": 0.7512487512487512, + "step": 680 + }, + { + "epoch": 3.45959595959596, + "grad_norm": 0.38646677136421204, + "learning_rate": 2.3803867633181574e-05, + "loss": 0.07444831728935242, + "memory(GiB)": 194.69, + "step": 685, + "token_acc": 0.9341205032001766, + "train_speed(iter/s)": 0.205939 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.934688925743103, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.05726301074028015, + "memory(GiB)": 194.69, + "step": 690, + "token_acc": 0.9722510074841682, + "train_speed(iter/s)": 0.205757 + }, + { + "epoch": 3.51010101010101, + "grad_norm": 0.5144165754318237, + "learning_rate": 2.2395409692487175e-05, + "loss": 0.059635859727859494, + "memory(GiB)": 194.69, + "step": 695, + "token_acc": 0.9783371472158657, + "train_speed(iter/s)": 0.205924 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.23240762948989868, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.009928755462169647, + "memory(GiB)": 194.69, + "step": 700, + "token_acc": 0.9956945388624519, + "train_speed(iter/s)": 0.206296 + }, + { + "epoch": 3.5353535353535355, + "eval_loss": 0.30824291706085205, + "eval_runtime": 1.8346, + "eval_samples_per_second": 2.18, + "eval_steps_per_second": 2.18, + "eval_token_acc": 0.7482517482517482, + "step": 700 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.35060033202171326, + "learning_rate": 2.1017782560671123e-05, + "loss": 0.036235207319259645, + "memory(GiB)": 194.69, + "step": 705, + "token_acc": 0.9407323518308796, + "train_speed(iter/s)": 0.206441 + }, + { + "epoch": 3.5858585858585856, + "grad_norm": 0.4275978207588196, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.031028282642364503, + "memory(GiB)": 194.69, + "step": 710, + "token_acc": 0.9893070552468812, + "train_speed(iter/s)": 0.206201 + }, + { + "epoch": 
3.611111111111111, + "grad_norm": 0.2053557187318802, + "learning_rate": 1.967252487164663e-05, + "loss": 0.002437155693769455, + "memory(GiB)": 194.69, + "step": 715, + "token_acc": 0.9986950848194868, + "train_speed(iter/s)": 0.206895 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.7973630428314209, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.01395910233259201, + "memory(GiB)": 194.69, + "step": 720, + "token_acc": 0.9963244012331041, + "train_speed(iter/s)": 0.206763 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.31287190318107605, + "eval_runtime": 1.8131, + "eval_samples_per_second": 2.206, + "eval_steps_per_second": 2.206, + "eval_token_acc": 0.7492507492507493, + "step": 720 + }, + { + "epoch": 3.6616161616161618, + "grad_norm": 0.45141226053237915, + "learning_rate": 1.836113910678507e-05, + "loss": 0.03765462040901184, + "memory(GiB)": 194.69, + "step": 725, + "token_acc": 0.9560504300695181, + "train_speed(iter/s)": 0.206518 + }, + { + "epoch": 3.686868686868687, + "grad_norm": 0.2863880693912506, + "learning_rate": 1.771860742205988e-05, + "loss": 0.05053359270095825, + "memory(GiB)": 194.69, + "step": 730, + "token_acc": 0.9809334657398212, + "train_speed(iter/s)": 0.206217 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.37708166241645813, + "learning_rate": 1.7085089916835923e-05, + "loss": 0.030423933267593385, + "memory(GiB)": 194.69, + "step": 735, + "token_acc": 0.9856907686463214, + "train_speed(iter/s)": 0.206105 + }, + { + "epoch": 3.7373737373737375, + "grad_norm": 0.5678088068962097, + "learning_rate": 1.646076349303884e-05, + "loss": 0.04238354861736297, + "memory(GiB)": 194.69, + "step": 740, + "token_acc": 0.9868162140889414, + "train_speed(iter/s)": 0.206402 + }, + { + "epoch": 3.7373737373737375, + "eval_loss": 0.3119312822818756, + "eval_runtime": 1.8083, + "eval_samples_per_second": 2.212, + "eval_steps_per_second": 2.212, + "eval_token_acc": 0.7562437562437563, + "step": 740 + }, + { + "epoch": 
3.7626262626262625, + "grad_norm": 0.3441520035266876, + "learning_rate": 1.584580248609846e-05, + "loss": 0.034655985236167905, + "memory(GiB)": 194.69, + "step": 745, + "token_acc": 0.9208163265306123, + "train_speed(iter/s)": 0.206715 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.7808462381362915, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.03131322860717774, + "memory(GiB)": 194.69, + "step": 750, + "token_acc": 0.9910037878787878, + "train_speed(iter/s)": 0.207112 + }, + { + "epoch": 3.813131313131313, + "grad_norm": 0.4915235638618469, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.10859463214874268, + "memory(GiB)": 194.69, + "step": 755, + "token_acc": 0.9532800955295171, + "train_speed(iter/s)": 0.206346 + }, + { + "epoch": 3.8383838383838382, + "grad_norm": 0.36231735348701477, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.01086568683385849, + "memory(GiB)": 194.69, + "step": 760, + "token_acc": 0.997327632282202, + "train_speed(iter/s)": 0.206831 + }, + { + "epoch": 3.8383838383838382, + "eval_loss": 0.30831077694892883, + "eval_runtime": 1.7998, + "eval_samples_per_second": 2.222, + "eval_steps_per_second": 2.222, + "eval_token_acc": 0.7542457542457542, + "step": 760 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.4630831182003021, + "learning_rate": 1.3483006802566544e-05, + "loss": 0.055282962322235105, + "memory(GiB)": 194.69, + "step": 765, + "token_acc": 0.9400766283524904, + "train_speed(iter/s)": 0.206812 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.10027548670768738, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.028864413499832153, + "memory(GiB)": 194.69, + "step": 770, + "token_acc": 0.989685597116938, + "train_speed(iter/s)": 0.206816 + }, + { + "epoch": 3.9141414141414144, + "grad_norm": 0.4784111976623535, + "learning_rate": 1.2362137491387432e-05, + "loss": 0.03930847942829132, + "memory(GiB)": 194.69, + "step": 775, + "token_acc": 0.9889024019458802, + 
"train_speed(iter/s)": 0.206798 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.4522968530654907, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.04449716806411743, + "memory(GiB)": 194.69, + "step": 780, + "token_acc": 0.9804500703234881, + "train_speed(iter/s)": 0.206775 + }, + { + "epoch": 3.9393939393939394, + "eval_loss": 0.3084229528903961, + "eval_runtime": 1.8372, + "eval_samples_per_second": 2.177, + "eval_steps_per_second": 2.177, + "eval_token_acc": 0.7532467532467533, + "step": 780 + }, + { + "epoch": 3.9646464646464645, + "grad_norm": 1.1634060144424438, + "learning_rate": 1.1283304875289336e-05, + "loss": 0.03509677648544311, + "memory(GiB)": 194.69, + "step": 785, + "token_acc": 0.9363057324840764, + "train_speed(iter/s)": 0.206843 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 0.5749446749687195, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.04887769818305969, + "memory(GiB)": 194.69, + "step": 790, + "token_acc": 0.9847414070028911, + "train_speed(iter/s)": 0.207005 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.2582509219646454, + "learning_rate": 1.024771387279585e-05, + "loss": 0.02856588065624237, + "memory(GiB)": 194.69, + "step": 795, + "token_acc": 0.9900847593155285, + "train_speed(iter/s)": 0.20717 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 0.3403402268886566, + "learning_rate": 9.746497343621857e-06, + "loss": 0.00884537547826767, + "memory(GiB)": 194.69, + "step": 800, + "token_acc": 0.9983554885569412, + "train_speed(iter/s)": 0.207207 + }, + { + "epoch": 4.040404040404041, + "eval_loss": 0.30910566449165344, + "eval_runtime": 1.7909, + "eval_samples_per_second": 2.234, + "eval_steps_per_second": 2.234, + "eval_token_acc": 0.7492507492507493, + "step": 800 + }, + { + "epoch": 4.065656565656566, + "grad_norm": 0.3678455650806427, + "learning_rate": 9.256521107059834e-06, + "loss": 0.01580266058444977, + "memory(GiB)": 194.69, + "step": 805, + "token_acc": 0.9634023220595659, + 
"train_speed(iter/s)": 0.206982 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.02392689324915409, + "learning_rate": 8.777921982911996e-06, + "loss": 0.002520921640098095, + "memory(GiB)": 194.69, + "step": 810, + "token_acc": 0.9998035749361619, + "train_speed(iter/s)": 0.207283 + }, + { + "epoch": 4.116161616161616, + "grad_norm": 0.29595932364463806, + "learning_rate": 8.310833614062651e-06, + "loss": 0.01393090933561325, + "memory(GiB)": 194.69, + "step": 815, + "token_acc": 0.9945280437756497, + "train_speed(iter/s)": 0.207408 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.033545322716236115, + "learning_rate": 7.85538642916015e-06, + "loss": 0.007520098239183426, + "memory(GiB)": 194.69, + "step": 820, + "token_acc": 0.9984389634717452, + "train_speed(iter/s)": 0.207412 + }, + { + "epoch": 4.141414141414141, + "eval_loss": 0.312429279088974, + "eval_runtime": 1.8138, + "eval_samples_per_second": 2.205, + "eval_steps_per_second": 2.205, + "eval_token_acc": 0.7492507492507493, + "step": 820 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.2401837557554245, + "learning_rate": 7.4117076061961885e-06, + "loss": 0.020593562722206117, + "memory(GiB)": 194.69, + "step": 825, + "token_acc": 0.9678671554884624, + "train_speed(iter/s)": 0.207046 + }, + { + "epoch": 4.191919191919192, + "grad_norm": 0.3309480845928192, + "learning_rate": 6.979921036993042e-06, + "loss": 0.051178747415542604, + "memory(GiB)": 194.69, + "step": 830, + "token_acc": 0.9609134826526131, + "train_speed(iter/s)": 0.207063 + }, + { + "epoch": 4.217171717171717, + "grad_norm": 0.37649092078208923, + "learning_rate": 6.5601472926081766e-06, + "loss": 0.0256537526845932, + "memory(GiB)": 194.69, + "step": 835, + "token_acc": 0.9917600102999872, + "train_speed(iter/s)": 0.207066 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.34800857305526733, + "learning_rate": 6.152503589666425e-06, + "loss": 0.02737319767475128, + "memory(GiB)": 194.69, + "step": 840, + 
"token_acc": 0.9857651245551602, + "train_speed(iter/s)": 0.206927 + }, + { + "epoch": 4.242424242424242, + "eval_loss": 0.3184429109096527, + "eval_runtime": 1.7906, + "eval_samples_per_second": 2.234, + "eval_steps_per_second": 2.234, + "eval_token_acc": 0.7482517482517482, + "step": 840 + }, + { + "epoch": 4.267676767676767, + "grad_norm": 0.7228756546974182, + "learning_rate": 5.757103757628573e-06, + "loss": 0.06719566583633423, + "memory(GiB)": 194.69, + "step": 845, + "token_acc": 0.9502145922746781, + "train_speed(iter/s)": 0.206416 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 0.38411420583724976, + "learning_rate": 5.374058207005944e-06, + "loss": 0.04835307598114014, + "memory(GiB)": 194.69, + "step": 850, + "token_acc": 0.981879262954291, + "train_speed(iter/s)": 0.206169 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.1029912531375885, + "learning_rate": 5.0034738985296095e-06, + "loss": 0.012648317217826843, + "memory(GiB)": 194.69, + "step": 855, + "token_acc": 0.9956322341122515, + "train_speed(iter/s)": 0.20603 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 0.24367812275886536, + "learning_rate": 4.645454313282965e-06, + "loss": 0.012134979665279388, + "memory(GiB)": 194.69, + "step": 860, + "token_acc": 0.9967785234899329, + "train_speed(iter/s)": 0.205974 + }, + { + "epoch": 4.343434343434343, + "eval_loss": 0.32311469316482544, + "eval_runtime": 1.7858, + "eval_samples_per_second": 2.24, + "eval_steps_per_second": 2.24, + "eval_token_acc": 0.7492507492507493, + "step": 860 + }, + { + "epoch": 4.3686868686868685, + "grad_norm": 0.18834778666496277, + "learning_rate": 4.3000994238058644e-06, + "loss": 0.010147520154714585, + "memory(GiB)": 194.69, + "step": 865, + "token_acc": 0.9645875251509054, + "train_speed(iter/s)": 0.20586 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.1570287048816681, + "learning_rate": 3.967505666178556e-06, + "loss": 0.008408330380916595, + "memory(GiB)": 194.69, + "step": 870, + 
"token_acc": 0.9981830194912454, + "train_speed(iter/s)": 0.206013 + }, + { + "epoch": 4.41919191919192, + "grad_norm": 0.15527097880840302, + "learning_rate": 3.647765913093132e-06, + "loss": 0.006087615713477134, + "memory(GiB)": 194.69, + "step": 875, + "token_acc": 0.996905820924386, + "train_speed(iter/s)": 0.206279 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.49585437774658203, + "learning_rate": 3.340969447919873e-06, + "loss": 0.01747373789548874, + "memory(GiB)": 194.69, + "step": 880, + "token_acc": 0.9946977730646872, + "train_speed(iter/s)": 0.206368 + }, + { + "epoch": 4.444444444444445, + "eval_loss": 0.32524874806404114, + "eval_runtime": 1.8805, + "eval_samples_per_second": 2.127, + "eval_steps_per_second": 2.127, + "eval_token_acc": 0.7482517482517482, + "step": 880 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.2330419272184372, + "learning_rate": 3.0472019397761064e-06, + "loss": 0.00792773962020874, + "memory(GiB)": 194.69, + "step": 885, + "token_acc": 0.9659295448701466, + "train_speed(iter/s)": 0.206247 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 0.13684527575969696, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.011314756423234939, + "memory(GiB)": 194.69, + "step": 890, + "token_acc": 0.9967257844474762, + "train_speed(iter/s)": 0.206195 + }, + { + "epoch": 4.52020202020202, + "grad_norm": 0.3359576463699341, + "learning_rate": 2.4990782572647975e-06, + "loss": 0.021187099814414977, + "memory(GiB)": 194.69, + "step": 895, + "token_acc": 0.9942469693856585, + "train_speed(iter/s)": 0.206448 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.28014275431632996, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.03609364330768585, + "memory(GiB)": 194.69, + "step": 900, + "token_acc": 0.9888475836431226, + "train_speed(iter/s)": 0.206265 + }, + { + "epoch": 4.545454545454545, + "eval_loss": 0.32550811767578125, + "eval_runtime": 1.8451, + "eval_samples_per_second": 2.168, + "eval_steps_per_second": 
2.168, + "eval_token_acc": 0.7472527472527473, + "step": 900 + }, + { + "epoch": 4.570707070707071, + "grad_norm": 1.3766915798187256, + "learning_rate": 2.004007049848461e-06, + "loss": 0.011403033882379532, + "memory(GiB)": 194.69, + "step": 905, + "token_acc": 0.9590459752079084, + "train_speed(iter/s)": 0.206299 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 0.06532011181116104, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.006288837641477585, + "memory(GiB)": 194.69, + "step": 910, + "token_acc": 0.9984427718660783, + "train_speed(iter/s)": 0.206688 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.021015260368585587, + "learning_rate": 1.5625412489637337e-06, + "loss": 0.017497456073760985, + "memory(GiB)": 194.69, + "step": 915, + "token_acc": 0.9928386408654579, + "train_speed(iter/s)": 0.206791 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 0.11464820802211761, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.004374519735574722, + "memory(GiB)": 194.69, + "step": 920, + "token_acc": 0.9980553154710458, + "train_speed(iter/s)": 0.207068 + }, + { + "epoch": 4.646464646464646, + "eval_loss": 0.32665225863456726, + "eval_runtime": 1.808, + "eval_samples_per_second": 2.212, + "eval_steps_per_second": 2.212, + "eval_token_acc": 0.7492507492507493, + "step": 920 + }, + { + "epoch": 4.671717171717171, + "grad_norm": 0.42058226466178894, + "learning_rate": 1.1751739156407649e-06, + "loss": 0.003269971534609795, + "memory(GiB)": 194.69, + "step": 925, + "token_acc": 0.9575136386179534, + "train_speed(iter/s)": 0.207081 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.008930183947086334, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.015348337590694427, + "memory(GiB)": 194.69, + "step": 930, + "token_acc": 0.9930389817024662, + "train_speed(iter/s)": 0.207309 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.42432549595832825, + "learning_rate": 8.423376898168245e-07, + "loss": 0.021287491917610167, + 
"memory(GiB)": 194.69, + "step": 935, + "token_acc": 0.9904045899693342, + "train_speed(iter/s)": 0.207003 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 0.09640122205018997, + "learning_rate": 6.964873004985717e-07, + "loss": 0.015300212800502777, + "memory(GiB)": 194.69, + "step": 940, + "token_acc": 0.9938466655579578, + "train_speed(iter/s)": 0.207138 + }, + { + "epoch": 4.747474747474747, + "eval_loss": 0.3267664313316345, + "eval_runtime": 1.8049, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 2.216, + "eval_token_acc": 0.7502497502497503, + "step": 940 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.09156661480665207, + "learning_rate": 5.644043071326932e-07, + "loss": 0.012129424512386322, + "memory(GiB)": 194.69, + "step": 945, + "token_acc": 0.9729514717581543, + "train_speed(iter/s)": 0.206551 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 0.014584671705961227, + "learning_rate": 4.461255922609986e-07, + "loss": 0.009703928232192993, + "memory(GiB)": 194.69, + "step": 950, + "token_acc": 0.9948709729123257, + "train_speed(iter/s)": 0.206615 + }, + { + "epoch": 4.8232323232323235, + "grad_norm": 0.09790431708097458, + "learning_rate": 3.416841837512952e-07, + "loss": 0.0038586195558309557, + "memory(GiB)": 194.69, + "step": 955, + "token_acc": 0.9988066825775657, + "train_speed(iter/s)": 0.206618 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.36749953031539917, + "learning_rate": 2.511092455747932e-07, + "loss": 0.023634913563728332, + "memory(GiB)": 194.69, + "step": 960, + "token_acc": 0.9918020343100046, + "train_speed(iter/s)": 0.206703 + }, + { + "epoch": 4.848484848484849, + "eval_loss": 0.327303946018219, + "eval_runtime": 1.8393, + "eval_samples_per_second": 2.175, + "eval_steps_per_second": 2.175, + "eval_token_acc": 0.7482517482517482, + "step": 960 + }, + { + "epoch": 4.873737373737374, + "grad_norm": 0.21629047393798828, + "learning_rate": 1.7442606966242004e-07, + "loss": 0.014398331940174102, + 
"memory(GiB)": 194.69, + "step": 965, + "token_acc": 0.9552456623966272, + "train_speed(iter/s)": 0.206714 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 0.2632206678390503, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.00892629474401474, + "memory(GiB)": 194.69, + "step": 970, + "token_acc": 0.9974954082484555, + "train_speed(iter/s)": 0.206903 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.14011166989803314, + "learning_rate": 6.281677086071303e-08, + "loss": 0.006720826029777527, + "memory(GiB)": 194.69, + "step": 975, + "token_acc": 0.9985783915515841, + "train_speed(iter/s)": 0.207185 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 0.07906040549278259, + "learning_rate": 2.792181348726941e-08, + "loss": 0.03538868129253388, + "memory(GiB)": 194.69, + "step": 980, + "token_acc": 0.9834126862233143, + "train_speed(iter/s)": 0.207226 + }, + { + "epoch": 4.94949494949495, + "eval_loss": 0.3269588053226471, + "eval_runtime": 1.7898, + "eval_samples_per_second": 2.235, + "eval_steps_per_second": 2.235, + "eval_token_acc": 0.7472527472527473, + "step": 980 + }, + { + "epoch": 4.974747474747475, + "grad_norm": 0.30053889751434326, + "learning_rate": 6.980940707146389e-09, + "loss": 0.06200398206710815, + "memory(GiB)": 194.69, + "step": 985, + "token_acc": 0.9508980763099161, + "train_speed(iter/s)": 0.206925 + }, + { + "epoch": 5.0, + "grad_norm": 0.19153615832328796, + "learning_rate": 0.0, + "loss": 0.004158956184983254, + "memory(GiB)": 194.69, + "step": 990, + "token_acc": 0.999026921829387, + "train_speed(iter/s)": 0.207036 + }, + { + "epoch": 5.0, + "eval_loss": 0.32688042521476746, + "eval_runtime": 1.8656, + "eval_samples_per_second": 2.144, + "eval_steps_per_second": 2.144, + "eval_token_acc": 0.7472527472527473, + "step": 990 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + 
"should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.384415932845261e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/training_args.bin b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..84723354b7c97cc0656161a46b521532fa73cd32 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ad8b7f2148494eb493ac7d940a063889b4311c7edf4791f9b3ad8178a343be +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1ee83bb64e07e1cef42acabc25983a2a31612496 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..9a6c94bf5452f39377654249c673b7b1bd192150 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_runtime.png differ diff --git 
a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..be988800346dac6f3e782b4ea38686b081278850 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..e24dcc548268eb7176d5e431c3d0fb2b0dd96373 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..a2791803a5eee60b8ee8504c5c31715a85ba97a2 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..28af4713ddf3ee940ab6789f15d73de984b4c8ea Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..eb469e4ed17a7d514088da4981a34d09aaf8c035 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..8977f64dd4afd86ccf4d0ec848be41268d0e90fc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..7e64cb056e7d132926880f52ded1b213d2d53152 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_memory(GiB).png new file mode 100644 index 
0000000000000000000000000000000000000000..17973b6a5788d4bcf6baf4e5bca4d8b47663bfbf Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..bedb6f2796c03218163804f6cec8f9fe04eda3af Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_total_flos.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..938edf92a8e0d9ca3d45bba3f0bc351cd48f52e8 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..e5db2d69729b18fcc90f9f186cce362095d95db3 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_runtime.png 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..1fd27dce69e3354bdf3bf96fbadec5ffbf43d87d Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..353f1f87809d86fd03ca055b6041d9c864953081 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..608216757078777241a4892a177a6552100e8dc6 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..6126e9a9019eaf07f06f69d5d975709c78676a9f Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/logging.jsonl b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44159691659110f1613c3a21dcc2ddd549a00dbf --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/logging.jsonl @@ -0,0 +1,251 @@ +{"loss": 0.56973904, "token_acc": 0.85137615, "grad_norm": 0.51298577, "learning_rate": 2e-06, "memory(GiB)": 144.03, "train_speed(iter/s)": 0.135022, "epoch": 0.00505051, "global_step/max_steps": "1/990", "percentage": "0.10%", "elapsed_time": "7s", "remaining_time": "1h 56m 7s"} +{"loss": 0.74687946, "token_acc": 0.82976132, "grad_norm": 0.86910707, "learning_rate": 1e-05, "memory(GiB)": 153.42, "train_speed(iter/s)": 0.212168, "epoch": 0.02525253, "global_step/max_steps": "5/990", "percentage": "0.51%", "elapsed_time": "23s", "remaining_time": "1h 16m 11s"} +{"loss": 0.79461751, "token_acc": 0.78732007, "grad_norm": 0.59658676, "learning_rate": 2e-05, "memory(GiB)": 160.41, "train_speed(iter/s)": 0.221849, "epoch": 0.05050505, "global_step/max_steps": "10/990", "percentage": "1.01%", "elapsed_time": "44s", "remaining_time": "1h 13m 2s"} +{"loss": 0.70086827, "token_acc": 0.80169447, "grad_norm": 0.54861104, "learning_rate": 3e-05, "memory(GiB)": 170.14, "train_speed(iter/s)": 0.218317, "epoch": 0.07575758, "global_step/max_steps": "15/990", "percentage": "1.52%", "elapsed_time": "1m 8s", "remaining_time": "1h 14m 2s"} +{"loss": 0.5246501, "token_acc": 0.87064677, "grad_norm": 4.01875162, "learning_rate": 4e-05, "memory(GiB)": 170.14, "train_speed(iter/s)": 0.236511, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": 
"2.02%", "elapsed_time": "1m 24s", "remaining_time": "1h 8m 3s"} +{"eval_loss": 0.72908872, "eval_token_acc": 0.72127872, "eval_runtime": 1.8127, "eval_samples_per_second": 2.207, "eval_steps_per_second": 2.207, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "1m 26s", "remaining_time": "1h 9m 31s"} +{"loss": 0.48354249, "token_acc": 0.8261477, "grad_norm": 0.4592025, "learning_rate": 5e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.214846, "epoch": 0.12626263, "global_step/max_steps": "25/990", "percentage": "2.53%", "elapsed_time": "1m 56s", "remaining_time": "1h 14m 37s"} +{"loss": 0.57038975, "token_acc": 0.81141543, "grad_norm": 1.14572322, "learning_rate": 6e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.223062, "epoch": 0.15151515, "global_step/max_steps": "30/990", "percentage": "3.03%", "elapsed_time": "2m 14s", "remaining_time": "1h 11m 32s"} +{"loss": 0.37099953, "token_acc": 0.85117541, "grad_norm": 0.32192853, "learning_rate": 7e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.226934, "epoch": 0.17676768, "global_step/max_steps": "35/990", "percentage": "3.54%", "elapsed_time": "2m 33s", "remaining_time": "1h 9m 58s"} +{"loss": 0.44092236, "token_acc": 0.84801604, "grad_norm": 0.29303941, "learning_rate": 8e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.226914, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "2m 55s", "remaining_time": "1h 9m 38s"} +{"eval_loss": 0.53985226, "eval_token_acc": 0.73626374, "eval_runtime": 1.8136, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "2m 57s", "remaining_time": "1h 10m 21s"} +{"loss": 0.44204354, "token_acc": 0.83846547, "grad_norm": 0.44231713, "learning_rate": 9e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.218055, "epoch": 0.22727273, "global_step/max_steps": "45/990", 
"percentage": "4.55%", "elapsed_time": "3m 26s", "remaining_time": "1h 12m 6s"} +{"loss": 0.52473979, "token_acc": 0.86844123, "grad_norm": 0.35660705, "learning_rate": 0.0001, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.213301, "epoch": 0.25252525, "global_step/max_steps": "50/990", "percentage": "5.05%", "elapsed_time": "3m 54s", "remaining_time": "1h 13m 20s"} +{"loss": 0.46730862, "token_acc": 0.84726821, "grad_norm": 0.41687265, "learning_rate": 9.999e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.21328, "epoch": 0.27777778, "global_step/max_steps": "55/990", "percentage": "5.56%", "elapsed_time": "4m 17s", "remaining_time": "1h 12m 57s"} +{"loss": 0.36838181, "token_acc": 0.85517241, "grad_norm": 0.71482617, "learning_rate": 9.997e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.218719, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "4m 33s", "remaining_time": "1h 10m 46s"} +{"eval_loss": 0.51275671, "eval_token_acc": 0.74225774, "eval_runtime": 1.8179, "eval_samples_per_second": 2.2, "eval_steps_per_second": 2.2, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "4m 35s", "remaining_time": "1h 11m 14s"} +{"loss": 0.54117289, "token_acc": 0.82982857, "grad_norm": 0.65702057, "learning_rate": 9.994e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.213853, "epoch": 0.32828283, "global_step/max_steps": "65/990", "percentage": "6.57%", "elapsed_time": "5m 3s", "remaining_time": "1h 12m 0s"} +{"loss": 0.39851596, "token_acc": 0.87324493, "grad_norm": 0.29019824, "learning_rate": 9.989e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.211799, "epoch": 0.35353535, "global_step/max_steps": "70/990", "percentage": "7.07%", "elapsed_time": "5m 30s", "remaining_time": "1h 12m 18s"} +{"loss": 0.56977282, "token_acc": 0.83548983, "grad_norm": 0.31978855, "learning_rate": 9.983e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.215997, "epoch": 0.37878788, 
"global_step/max_steps": "75/990", "percentage": "7.58%", "elapsed_time": "5m 46s", "remaining_time": "1h 10m 31s"} +{"loss": 0.60112586, "token_acc": 0.83819351, "grad_norm": 0.39013043, "learning_rate": 9.975e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.216474, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "6m 9s", "remaining_time": "1h 9m 59s"} +{"eval_loss": 0.52264911, "eval_token_acc": 0.73926074, "eval_runtime": 1.8268, "eval_samples_per_second": 2.19, "eval_steps_per_second": 2.19, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "6m 11s", "remaining_time": "1h 10m 20s"} +{"loss": 0.54490166, "token_acc": 0.8128, "grad_norm": 0.27575281, "learning_rate": 9.966e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.213104, "epoch": 0.42929293, "global_step/max_steps": "85/990", "percentage": "8.59%", "elapsed_time": "6m 38s", "remaining_time": "1h 10m 42s"} +{"loss": 0.43974614, "token_acc": 0.84726576, "grad_norm": 0.63812447, "learning_rate": 9.955e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.214392, "epoch": 0.45454545, "global_step/max_steps": "90/990", "percentage": "9.09%", "elapsed_time": "6m 59s", "remaining_time": "1h 9m 54s"} +{"loss": 0.353268, "token_acc": 0.86625734, "grad_norm": 0.33711073, "learning_rate": 9.944e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.21374, "epoch": 0.47979798, "global_step/max_steps": "95/990", "percentage": "9.60%", "elapsed_time": "7m 24s", "remaining_time": "1h 9m 43s"} +{"loss": 0.41753764, "token_acc": 0.86404697, "grad_norm": 0.31318393, "learning_rate": 9.93e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.211078, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "7m 53s", "remaining_time": "1h 10m 13s"} +{"eval_loss": 0.51022243, "eval_token_acc": 0.73826174, "eval_runtime": 1.8327, "eval_samples_per_second": 2.183, "eval_steps_per_second": 2.183, 
"epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "7m 55s", "remaining_time": "1h 10m 29s"} +{"loss": 0.54590735, "token_acc": 0.80391121, "grad_norm": 0.39832383, "learning_rate": 9.916e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.210332, "epoch": 0.53030303, "global_step/max_steps": "105/990", "percentage": "10.61%", "elapsed_time": "8m 18s", "remaining_time": "1h 10m 4s"} +{"loss": 0.51217399, "token_acc": 0.80972818, "grad_norm": 1.30077887, "learning_rate": 9.9e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.214826, "epoch": 0.55555556, "global_step/max_steps": "110/990", "percentage": "11.11%", "elapsed_time": "8m 31s", "remaining_time": "1h 8m 13s"} +{"loss": 0.45124936, "token_acc": 0.86146286, "grad_norm": 0.34517738, "learning_rate": 9.882e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.211594, "epoch": 0.58080808, "global_step/max_steps": "115/990", "percentage": "11.62%", "elapsed_time": "9m 3s", "remaining_time": "1h 8m 52s"} +{"loss": 0.50853257, "token_acc": 0.81305748, "grad_norm": 0.7278887, "learning_rate": 9.864e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.21067, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "9m 29s", "remaining_time": "1h 8m 47s"} +{"eval_loss": 0.50906134, "eval_token_acc": 0.73826174, "eval_runtime": 1.7896, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "9m 31s", "remaining_time": "1h 9m 0s"} +{"loss": 0.46891389, "token_acc": 0.83522818, "grad_norm": 0.3801716, "learning_rate": 9.844e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.210631, "epoch": 0.63131313, "global_step/max_steps": "125/990", "percentage": "12.63%", "elapsed_time": "9m 53s", "remaining_time": "1h 8m 24s"} +{"loss": 0.47588787, "token_acc": 0.8449714, "grad_norm": 0.4045288, "learning_rate": 9.822e-05, "memory(GiB)": 
194.67, "train_speed(iter/s)": 0.209823, "epoch": 0.65656566, "global_step/max_steps": "130/990", "percentage": "13.13%", "elapsed_time": "10m 19s", "remaining_time": "1h 8m 16s"} +{"loss": 0.37209432, "token_acc": 0.87283399, "grad_norm": 0.4168182, "learning_rate": 9.8e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.209914, "epoch": 0.68181818, "global_step/max_steps": "135/990", "percentage": "13.64%", "elapsed_time": "10m 42s", "remaining_time": "1h 7m 50s"} +{"loss": 0.53056355, "token_acc": 0.82782405, "grad_norm": 0.43215978, "learning_rate": 9.776e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20961, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "11m 7s", "remaining_time": "1h 7m 32s"} +{"eval_loss": 0.44480118, "eval_token_acc": 0.75024975, "eval_runtime": 1.8087, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "11m 9s", "remaining_time": "1h 7m 43s"} +{"loss": 0.34435036, "token_acc": 0.86174778, "grad_norm": 0.51403648, "learning_rate": 9.75e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.207394, "epoch": 0.73232323, "global_step/max_steps": "145/990", "percentage": "14.65%", "elapsed_time": "11m 38s", "remaining_time": "1h 7m 52s"} +{"loss": 0.40156693, "token_acc": 0.87209497, "grad_norm": 0.89774364, "learning_rate": 9.723e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20834, "epoch": 0.75757576, "global_step/max_steps": "150/990", "percentage": "15.15%", "elapsed_time": "11m 59s", "remaining_time": "1h 7m 9s"} +{"loss": 0.41990581, "token_acc": 0.85655379, "grad_norm": 0.60194051, "learning_rate": 9.695e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20801, "epoch": 0.78282828, "global_step/max_steps": "155/990", "percentage": "15.66%", "elapsed_time": "12m 24s", "remaining_time": "1h 6m 52s"} +{"loss": 0.3477031, "token_acc": 0.87954081, "grad_norm": 
0.41383395, "learning_rate": 9.666e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208783, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "12m 45s", "remaining_time": "1h 6m 13s"} +{"eval_loss": 0.42330542, "eval_token_acc": 0.74425574, "eval_runtime": 1.8491, "eval_samples_per_second": 2.163, "eval_steps_per_second": 2.163, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "12m 47s", "remaining_time": "1h 6m 23s"} +{"loss": 0.40957146, "token_acc": 0.84564214, "grad_norm": 0.28167769, "learning_rate": 9.635e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20789, "epoch": 0.83333333, "global_step/max_steps": "165/990", "percentage": "16.67%", "elapsed_time": "13m 13s", "remaining_time": "1h 6m 6s"} +{"loss": 0.45749435, "token_acc": 0.85327165, "grad_norm": 0.41569507, "learning_rate": 9.603e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20746, "epoch": 0.85858586, "global_step/max_steps": "170/990", "percentage": "17.17%", "elapsed_time": "13m 39s", "remaining_time": "1h 5m 50s"} +{"loss": 0.41641445, "token_acc": 0.85243944, "grad_norm": 0.49325898, "learning_rate": 9.57e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208373, "epoch": 0.88383838, "global_step/max_steps": "175/990", "percentage": "17.68%", "elapsed_time": "13m 59s", "remaining_time": "1h 5m 9s"} +{"loss": 0.42221174, "token_acc": 0.86205251, "grad_norm": 0.36325562, "learning_rate": 9.535e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208791, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "14m 21s", "remaining_time": "1h 4m 37s"} +{"eval_loss": 0.45568228, "eval_token_acc": 0.75824176, "eval_runtime": 1.8409, "eval_samples_per_second": 2.173, "eval_steps_per_second": 2.173, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "14m 23s", "remaining_time": "1h 4m 46s"} +{"loss": 
0.49337053, "token_acc": 0.8209519, "grad_norm": 0.47751582, "learning_rate": 9.5e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.207976, "epoch": 0.93434343, "global_step/max_steps": "185/990", "percentage": "18.69%", "elapsed_time": "14m 49s", "remaining_time": "1h 4m 29s"} +{"loss": 0.72712517, "token_acc": 0.79656241, "grad_norm": 2.74287009, "learning_rate": 9.463e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.209678, "epoch": 0.95959596, "global_step/max_steps": "190/990", "percentage": "19.19%", "elapsed_time": "15m 5s", "remaining_time": "1h 3m 33s"} +{"loss": 0.54947252, "token_acc": 0.81973392, "grad_norm": 0.46910813, "learning_rate": 9.424e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208894, "epoch": 0.98484848, "global_step/max_steps": "195/990", "percentage": "19.70%", "elapsed_time": "15m 33s", "remaining_time": "1h 3m 24s"} +{"loss": 0.473172, "token_acc": 0.85670992, "grad_norm": 0.29799834, "learning_rate": 9.385e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20626, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "16m 9s", "remaining_time": "1h 3m 48s"} +{"eval_loss": 0.49900115, "eval_token_acc": 0.75024975, "eval_runtime": 1.8407, "eval_samples_per_second": 2.173, "eval_steps_per_second": 2.173, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "16m 11s", "remaining_time": "1h 3m 55s"} +{"loss": 0.42464404, "token_acc": 0.83732057, "grad_norm": 0.38004476, "learning_rate": 9.344e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203763, "epoch": 1.03535354, "global_step/max_steps": "205/990", "percentage": "20.71%", "elapsed_time": "16m 45s", "remaining_time": "1h 4m 11s"} +{"loss": 0.35454028, "token_acc": 0.87744555, "grad_norm": 0.43479174, "learning_rate": 9.302e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203468, "epoch": 1.06060606, "global_step/max_steps": "210/990", "percentage": "21.21%", "elapsed_time": 
"17m 11s", "remaining_time": "1h 3m 52s"} +{"loss": 0.32962794, "token_acc": 0.88917552, "grad_norm": 0.38388073, "learning_rate": 9.259e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202581, "epoch": 1.08585859, "global_step/max_steps": "215/990", "percentage": "21.72%", "elapsed_time": "17m 40s", "remaining_time": "1h 3m 44s"} +{"loss": 0.29720905, "token_acc": 0.90020278, "grad_norm": 0.54890692, "learning_rate": 9.214e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202983, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "18m 3s", "remaining_time": "1h 3m 12s"} +{"eval_loss": 0.50207913, "eval_token_acc": 0.75324675, "eval_runtime": 1.8362, "eval_samples_per_second": 2.178, "eval_steps_per_second": 2.178, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "18m 5s", "remaining_time": "1h 3m 18s"} +{"loss": 0.42758522, "token_acc": 0.85019073, "grad_norm": 0.60289013, "learning_rate": 9.169e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202184, "epoch": 1.13636364, "global_step/max_steps": "225/990", "percentage": "22.73%", "elapsed_time": "18m 32s", "remaining_time": "1h 3m 2s"} +{"loss": 0.3600594, "token_acc": 0.87542388, "grad_norm": 0.39421549, "learning_rate": 9.122e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.200825, "epoch": 1.16161616, "global_step/max_steps": "230/990", "percentage": "23.23%", "elapsed_time": "19m 4s", "remaining_time": "1h 3m 3s"} +{"loss": 0.15720742, "token_acc": 0.92824193, "grad_norm": 0.46965522, "learning_rate": 9.074e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202308, "epoch": 1.18686869, "global_step/max_steps": "235/990", "percentage": "23.74%", "elapsed_time": "19m 21s", "remaining_time": "1h 2m 10s"} +{"loss": 0.38554611, "token_acc": 0.87112641, "grad_norm": 0.74775624, "learning_rate": 9.025e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202911, "epoch": 1.21212121, "global_step/max_steps": 
"240/990", "percentage": "24.24%", "elapsed_time": "19m 42s", "remaining_time": "1h 1m 35s"} +{"eval_loss": 0.53340024, "eval_token_acc": 0.74425574, "eval_runtime": 1.8268, "eval_samples_per_second": 2.19, "eval_steps_per_second": 2.19, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "19m 44s", "remaining_time": "1h 1m 40s"} +{"loss": 0.24191198, "token_acc": 0.8654323, "grad_norm": 0.50347519, "learning_rate": 8.975e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203143, "epoch": 1.23737374, "global_step/max_steps": "245/990", "percentage": "24.75%", "elapsed_time": "20m 5s", "remaining_time": "1h 1m 6s"} +{"loss": 0.37986381, "token_acc": 0.86459489, "grad_norm": 0.79619437, "learning_rate": 8.924e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.204558, "epoch": 1.26262626, "global_step/max_steps": "250/990", "percentage": "25.25%", "elapsed_time": "20m 21s", "remaining_time": "1h 0m 16s"} +{"loss": 0.34370255, "token_acc": 0.87003342, "grad_norm": 0.45351708, "learning_rate": 8.872e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203403, "epoch": 1.28787879, "global_step/max_steps": "255/990", "percentage": "25.76%", "elapsed_time": "20m 53s", "remaining_time": "1h 0m 12s"} +{"loss": 0.32487206, "token_acc": 0.87575705, "grad_norm": 0.70948952, "learning_rate": 8.818e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.204064, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "21m 13s", "remaining_time": "59m 36s"} +{"eval_loss": 0.46274555, "eval_token_acc": 0.75024975, "eval_runtime": 1.8037, "eval_samples_per_second": 2.218, "eval_steps_per_second": 2.218, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "21m 15s", "remaining_time": "59m 41s"} +{"loss": 0.25544384, "token_acc": 0.9007542, "grad_norm": 0.22084305, "learning_rate": 8.764e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.201941, "epoch": 
1.33838384, "global_step/max_steps": "265/990", "percentage": "26.77%", "elapsed_time": "21m 51s", "remaining_time": "59m 49s"} +{"loss": 0.23369157, "token_acc": 0.91072653, "grad_norm": 0.44375062, "learning_rate": 8.708e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202564, "epoch": 1.36363636, "global_step/max_steps": "270/990", "percentage": "27.27%", "elapsed_time": "22m 12s", "remaining_time": "59m 13s"} +{"loss": 0.25317478, "token_acc": 0.9063346, "grad_norm": 0.42193475, "learning_rate": 8.652e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20219, "epoch": 1.38888889, "global_step/max_steps": "275/990", "percentage": "27.78%", "elapsed_time": "22m 39s", "remaining_time": "58m 55s"} +{"loss": 0.37001193, "token_acc": 0.88657612, "grad_norm": 0.82838076, "learning_rate": 8.594e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20285, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "22m 59s", "remaining_time": "58m 19s"} +{"eval_loss": 0.42734003, "eval_token_acc": 0.75524476, "eval_runtime": 1.8284, "eval_samples_per_second": 2.188, "eval_steps_per_second": 2.188, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "23m 1s", "remaining_time": "58m 23s"} +{"loss": 0.1801905, "token_acc": 0.90671059, "grad_norm": 0.70956218, "learning_rate": 8.536e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203001, "epoch": 1.43939394, "global_step/max_steps": "285/990", "percentage": "28.79%", "elapsed_time": "23m 23s", "remaining_time": "57m 52s"} +{"loss": 0.33577933, "token_acc": 0.8656873, "grad_norm": 0.77200943, "learning_rate": 8.476e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203902, "epoch": 1.46464646, "global_step/max_steps": "290/990", "percentage": "29.29%", "elapsed_time": "23m 41s", "remaining_time": "57m 12s"} +{"loss": 0.34617891, "token_acc": 0.85153145, "grad_norm": 1.25244784, "learning_rate": 8.415e-05, "memory(GiB)": 194.67, 
"train_speed(iter/s)": 0.204926, "epoch": 1.48989899, "global_step/max_steps": "295/990", "percentage": "29.80%", "elapsed_time": "23m 59s", "remaining_time": "56m 30s"} +{"loss": 0.2885335, "token_acc": 0.89046563, "grad_norm": 2.09505773, "learning_rate": 8.354e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.205093, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "24m 22s", "remaining_time": "56m 3s"} +{"eval_loss": 0.43511924, "eval_token_acc": 0.74325674, "eval_runtime": 1.8124, "eval_samples_per_second": 2.207, "eval_steps_per_second": 2.207, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "24m 24s", "remaining_time": "56m 7s"} +{"loss": 0.33441875, "token_acc": 0.87207799, "grad_norm": 0.51694202, "learning_rate": 8.291e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20388, "epoch": 1.54040404, "global_step/max_steps": "305/990", "percentage": "30.81%", "elapsed_time": "24m 55s", "remaining_time": "55m 59s"} +{"loss": 0.25347624, "token_acc": 0.90839866, "grad_norm": 0.50443596, "learning_rate": 8.228e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20477, "epoch": 1.56565657, "global_step/max_steps": "310/990", "percentage": "31.31%", "elapsed_time": "25m 13s", "remaining_time": "55m 20s"} +{"loss": 0.23027022, "token_acc": 0.90536834, "grad_norm": 1.48407209, "learning_rate": 8.164e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205493, "epoch": 1.59090909, "global_step/max_steps": "315/990", "percentage": "31.82%", "elapsed_time": "25m 32s", "remaining_time": "54m 44s"} +{"loss": 0.2922214, "token_acc": 0.9086255, "grad_norm": 0.47270119, "learning_rate": 8.099e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205726, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "25m 55s", "remaining_time": "54m 16s"} +{"eval_loss": 0.37549704, "eval_token_acc": 0.75724276, "eval_runtime": 1.7963, 
"eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "25m 56s", "remaining_time": "54m 19s"} +{"loss": 0.29368644, "token_acc": 0.86976493, "grad_norm": 0.53276747, "learning_rate": 8.033e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205415, "epoch": 1.64141414, "global_step/max_steps": "325/990", "percentage": "32.83%", "elapsed_time": "26m 21s", "remaining_time": "53m 56s"} +{"loss": 0.35686882, "token_acc": 0.89534634, "grad_norm": 0.73298174, "learning_rate": 7.966e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20625, "epoch": 1.66666667, "global_step/max_steps": "330/990", "percentage": "33.33%", "elapsed_time": "26m 39s", "remaining_time": "53m 19s"} +{"loss": 0.28799734, "token_acc": 0.89158667, "grad_norm": 0.46463269, "learning_rate": 7.898e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206245, "epoch": 1.69191919, "global_step/max_steps": "335/990", "percentage": "33.84%", "elapsed_time": "27m 3s", "remaining_time": "52m 55s"} +{"loss": 0.19362801, "token_acc": 0.92170578, "grad_norm": 0.98276621, "learning_rate": 7.83e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207024, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "27m 21s", "remaining_time": "52m 19s"} +{"eval_loss": 0.36126405, "eval_token_acc": 0.74925075, "eval_runtime": 1.7972, "eval_samples_per_second": 2.226, "eval_steps_per_second": 2.226, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "27m 23s", "remaining_time": "52m 22s"} +{"loss": 0.24427512, "token_acc": 0.90341463, "grad_norm": 0.53730303, "learning_rate": 7.76e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20636, "epoch": 1.74242424, "global_step/max_steps": "345/990", "percentage": "34.85%", "elapsed_time": "27m 51s", "remaining_time": "52m 4s"} +{"loss": 0.17367718, "token_acc": 0.93241167, 
"grad_norm": 1.43868363, "learning_rate": 7.69e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207166, "epoch": 1.76767677, "global_step/max_steps": "350/990", "percentage": "35.35%", "elapsed_time": "28m 9s", "remaining_time": "51m 28s"} +{"loss": 0.35419593, "token_acc": 0.8842234, "grad_norm": 0.91168243, "learning_rate": 7.62e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207241, "epoch": 1.79292929, "global_step/max_steps": "355/990", "percentage": "35.86%", "elapsed_time": "28m 32s", "remaining_time": "51m 3s"} +{"loss": 0.22004578, "token_acc": 0.90809328, "grad_norm": 0.32813597, "learning_rate": 7.548e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207214, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "28m 56s", "remaining_time": "50m 39s"} +{"eval_loss": 0.35305825, "eval_token_acc": 0.75324675, "eval_runtime": 1.8366, "eval_samples_per_second": 2.178, "eval_steps_per_second": 2.178, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "28m 58s", "remaining_time": "50m 42s"} +{"loss": 0.28913445, "token_acc": 0.87079473, "grad_norm": 0.90348744, "learning_rate": 7.476e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207182, "epoch": 1.84343434, "global_step/max_steps": "365/990", "percentage": "36.87%", "elapsed_time": "29m 21s", "remaining_time": "50m 16s"} +{"loss": 0.29883947, "token_acc": 0.89075844, "grad_norm": 0.37799835, "learning_rate": 7.403e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207123, "epoch": 1.86868687, "global_step/max_steps": "370/990", "percentage": "37.37%", "elapsed_time": "29m 46s", "remaining_time": "49m 52s"} +{"loss": 0.34903314, "token_acc": 0.88231836, "grad_norm": 0.55158281, "learning_rate": 7.329e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206817, "epoch": 1.89393939, "global_step/max_steps": "375/990", "percentage": "37.88%", "elapsed_time": "30m 12s", "remaining_time": "49m 33s"} +{"loss": 
0.31674423, "token_acc": 0.89504373, "grad_norm": 0.71026605, "learning_rate": 7.255e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206935, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "30m 35s", "remaining_time": "49m 7s"} +{"eval_loss": 0.3127498, "eval_token_acc": 0.76323676, "eval_runtime": 1.8255, "eval_samples_per_second": 2.191, "eval_steps_per_second": 2.191, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "30m 37s", "remaining_time": "49m 10s"} +{"loss": 0.33083692, "token_acc": 0.88451646, "grad_norm": 0.24322477, "learning_rate": 7.18e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206393, "epoch": 1.94444444, "global_step/max_steps": "385/990", "percentage": "38.89%", "elapsed_time": "31m 5s", "remaining_time": "48m 50s"} +{"loss": 0.23392997, "token_acc": 0.90329243, "grad_norm": 0.39917448, "learning_rate": 7.105e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206252, "epoch": 1.96969697, "global_step/max_steps": "390/990", "percentage": "39.39%", "elapsed_time": "31m 30s", "remaining_time": "48m 28s"} +{"loss": 0.19444363, "token_acc": 0.91709269, "grad_norm": 0.38407907, "learning_rate": 7.029e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206907, "epoch": 1.99494949, "global_step/max_steps": "395/990", "percentage": "39.90%", "elapsed_time": "31m 48s", "remaining_time": "47m 55s"} +{"loss": 0.17200204, "token_acc": 0.95649392, "grad_norm": 0.51863116, "learning_rate": 6.952e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207883, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "32m 3s", "remaining_time": "47m 17s"} +{"eval_loss": 0.30318493, "eval_token_acc": 0.75424575, "eval_runtime": 1.7906, "eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "32m 5s", 
"remaining_time": "47m 20s"} +{"loss": 0.12331185, "token_acc": 0.9200657, "grad_norm": 0.69814432, "learning_rate": 6.875e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207599, "epoch": 2.04545455, "global_step/max_steps": "405/990", "percentage": "40.91%", "elapsed_time": "32m 30s", "remaining_time": "46m 57s"} +{"loss": 0.12755507, "token_acc": 0.93373408, "grad_norm": 0.99563086, "learning_rate": 6.797e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207712, "epoch": 2.07070707, "global_step/max_steps": "410/990", "percentage": "41.41%", "elapsed_time": "32m 53s", "remaining_time": "46m 31s"} +{"loss": 0.12629324, "token_acc": 0.94336712, "grad_norm": 0.57860726, "learning_rate": 6.719e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208254, "epoch": 2.0959596, "global_step/max_steps": "415/990", "percentage": "41.92%", "elapsed_time": "33m 12s", "remaining_time": "46m 0s"} +{"loss": 0.08971643, "token_acc": 0.98426905, "grad_norm": 0.40014225, "learning_rate": 6.64e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208984, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "33m 29s", "remaining_time": "45m 26s"} +{"eval_loss": 0.29718354, "eval_token_acc": 0.76223776, "eval_runtime": 1.8393, "eval_samples_per_second": 2.175, "eval_steps_per_second": 2.175, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "33m 31s", "remaining_time": "45m 29s"} +{"loss": 0.14678299, "token_acc": 0.91186944, "grad_norm": 0.64548421, "learning_rate": 6.561e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208889, "epoch": 2.14646465, "global_step/max_steps": "425/990", "percentage": "42.93%", "elapsed_time": "33m 54s", "remaining_time": "45m 4s"} +{"loss": 0.08001596, "token_acc": 0.97462557, "grad_norm": 0.30150402, "learning_rate": 6.481e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208643, "epoch": 2.17171717, "global_step/max_steps": "430/990", "percentage": 
"43.43%", "elapsed_time": "34m 20s", "remaining_time": "44m 43s"} +{"loss": 0.12085707, "token_acc": 0.96702482, "grad_norm": 0.35809425, "learning_rate": 6.401e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20794, "epoch": 2.1969697, "global_step/max_steps": "435/990", "percentage": "43.94%", "elapsed_time": "34m 51s", "remaining_time": "44m 28s"} +{"loss": 0.12868813, "token_acc": 0.95372962, "grad_norm": 0.83563262, "learning_rate": 6.321e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208265, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "35m 12s", "remaining_time": "44m 0s"} +{"eval_loss": 0.3201369, "eval_token_acc": 0.75424575, "eval_runtime": 1.8482, "eval_samples_per_second": 2.164, "eval_steps_per_second": 2.164, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "35m 14s", "remaining_time": "44m 2s"} +{"loss": 0.05988649, "token_acc": 0.9480653, "grad_norm": 0.37407261, "learning_rate": 6.24e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207886, "epoch": 2.24747475, "global_step/max_steps": "445/990", "percentage": "44.95%", "elapsed_time": "35m 40s", "remaining_time": "43m 41s"} +{"loss": 0.19066305, "token_acc": 0.93323331, "grad_norm": 0.86166501, "learning_rate": 6.159e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208264, "epoch": 2.27272727, "global_step/max_steps": "450/990", "percentage": "45.45%", "elapsed_time": "36m 0s", "remaining_time": "43m 12s"} +{"loss": 0.09232728, "token_acc": 0.96713935, "grad_norm": 0.34099314, "learning_rate": 6.078e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208355, "epoch": 2.2979798, "global_step/max_steps": "455/990", "percentage": "45.96%", "elapsed_time": "36m 23s", "remaining_time": "42m 47s"} +{"loss": 0.11596426, "token_acc": 0.96093127, "grad_norm": 0.35433683, "learning_rate": 5.996e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208578, "epoch": 2.32323232, 
"global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "36m 45s", "remaining_time": "42m 20s"} +{"eval_loss": 0.32146299, "eval_token_acc": 0.75824176, "eval_runtime": 1.8025, "eval_samples_per_second": 2.219, "eval_steps_per_second": 2.219, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "36m 46s", "remaining_time": "42m 22s"} +{"loss": 0.11863927, "token_acc": 0.91221454, "grad_norm": 0.52673078, "learning_rate": 5.914e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208792, "epoch": 2.34848485, "global_step/max_steps": "465/990", "percentage": "46.97%", "elapsed_time": "37m 6s", "remaining_time": "41m 54s"} +{"loss": 0.07829801, "token_acc": 0.96805003, "grad_norm": 0.70425922, "learning_rate": 5.832e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208441, "epoch": 2.37373737, "global_step/max_steps": "470/990", "percentage": "47.47%", "elapsed_time": "37m 34s", "remaining_time": "41m 34s"} +{"loss": 0.09042804, "token_acc": 0.9607806, "grad_norm": 0.85261846, "learning_rate": 5.749e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208835, "epoch": 2.3989899, "global_step/max_steps": "475/990", "percentage": "47.98%", "elapsed_time": "37m 54s", "remaining_time": "41m 5s"} +{"loss": 0.17373031, "token_acc": 0.94228663, "grad_norm": 0.34660023, "learning_rate": 5.666e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207896, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "38m 28s", "remaining_time": "40m 52s"} +{"eval_loss": 0.31417391, "eval_token_acc": 0.75824176, "eval_runtime": 1.7942, "eval_samples_per_second": 2.229, "eval_steps_per_second": 2.229, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "38m 30s", "remaining_time": "40m 54s"} +{"loss": 0.13746129, "token_acc": 0.9038552, "grad_norm": 0.90347826, "learning_rate": 5.584e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 
0.208273, "epoch": 2.44949495, "global_step/max_steps": "485/990", "percentage": "48.99%", "elapsed_time": "38m 48s", "remaining_time": "40m 24s"} +{"loss": 0.18579383, "token_acc": 0.92807018, "grad_norm": 0.57208484, "learning_rate": 5.5e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208224, "epoch": 2.47474747, "global_step/max_steps": "490/990", "percentage": "49.49%", "elapsed_time": "39m 12s", "remaining_time": "40m 0s"} +{"loss": 0.11083965, "token_acc": 0.95607613, "grad_norm": 0.35935673, "learning_rate": 5.417e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208702, "epoch": 2.5, "global_step/max_steps": "495/990", "percentage": "50.00%", "elapsed_time": "39m 31s", "remaining_time": "39m 31s"} +{"loss": 0.17410432, "token_acc": 0.9156242, "grad_norm": 0.54744899, "learning_rate": 5.334e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208566, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "39m 56s", "remaining_time": "39m 9s"} +{"eval_loss": 0.31219679, "eval_token_acc": 0.75624376, "eval_runtime": 1.8195, "eval_samples_per_second": 2.198, "eval_steps_per_second": 2.198, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "39m 58s", "remaining_time": "39m 10s"} +{"loss": 0.09475701, "token_acc": 0.92925571, "grad_norm": 0.59536976, "learning_rate": 5.251e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208436, "epoch": 2.55050505, "global_step/max_steps": "505/990", "percentage": "51.01%", "elapsed_time": "40m 22s", "remaining_time": "38m 46s"} +{"loss": 0.1080175, "token_acc": 0.96735395, "grad_norm": 0.60277241, "learning_rate": 5.167e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208985, "epoch": 2.57575758, "global_step/max_steps": "510/990", "percentage": "51.52%", "elapsed_time": "40m 40s", "remaining_time": "38m 16s"} +{"loss": 0.1285596, "token_acc": 0.95174089, "grad_norm": 0.37383988, "learning_rate": 5.084e-05, "memory(GiB)": 
194.68, "train_speed(iter/s)": 0.209255, "epoch": 2.6010101, "global_step/max_steps": "515/990", "percentage": "52.02%", "elapsed_time": "41m 0s", "remaining_time": "37m 49s"} +{"loss": 0.13147178, "token_acc": 0.9425986, "grad_norm": 0.43995336, "learning_rate": 5e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.209221, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "41m 25s", "remaining_time": "37m 26s"} +{"eval_loss": 0.31124952, "eval_token_acc": 0.75524476, "eval_runtime": 1.8299, "eval_samples_per_second": 2.186, "eval_steps_per_second": 2.186, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "41m 26s", "remaining_time": "37m 27s"} +{"loss": 0.08995424, "token_acc": 0.93090909, "grad_norm": 0.45797107, "learning_rate": 4.916e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.209276, "epoch": 2.65151515, "global_step/max_steps": "525/990", "percentage": "53.03%", "elapsed_time": "41m 48s", "remaining_time": "37m 1s"} +{"loss": 0.15920202, "token_acc": 0.94420681, "grad_norm": 0.38593069, "learning_rate": 4.833e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208661, "epoch": 2.67676768, "global_step/max_steps": "530/990", "percentage": "53.54%", "elapsed_time": "42m 19s", "remaining_time": "36m 44s"} +{"loss": 0.10588231, "token_acc": 0.96555543, "grad_norm": 0.90833467, "learning_rate": 4.749e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208205, "epoch": 2.7020202, "global_step/max_steps": "535/990", "percentage": "54.04%", "elapsed_time": "42m 49s", "remaining_time": "36m 25s"} +{"loss": 0.21194179, "token_acc": 0.91970121, "grad_norm": 0.90498143, "learning_rate": 4.666e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207772, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "43m 18s", "remaining_time": "36m 5s"} +{"eval_loss": 0.31014651, "eval_token_acc": 0.75524476, "eval_runtime": 1.805, 
"eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "43m 20s", "remaining_time": "36m 7s"} +{"loss": 0.15268012, "token_acc": 0.90911226, "grad_norm": 0.55956703, "learning_rate": 4.583e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207444, "epoch": 2.75252525, "global_step/max_steps": "545/990", "percentage": "55.05%", "elapsed_time": "43m 46s", "remaining_time": "35m 44s"} +{"loss": 0.10676954, "token_acc": 0.96086556, "grad_norm": 0.2707825, "learning_rate": 4.5e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207501, "epoch": 2.77777778, "global_step/max_steps": "550/990", "percentage": "55.56%", "elapsed_time": "44m 10s", "remaining_time": "35m 20s"} +{"loss": 0.09652482, "token_acc": 0.9671725, "grad_norm": 0.53631139, "learning_rate": 4.416e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207753, "epoch": 2.8030303, "global_step/max_steps": "555/990", "percentage": "56.06%", "elapsed_time": "44m 31s", "remaining_time": "34m 53s"} +{"loss": 0.11957679, "token_acc": 0.94442409, "grad_norm": 0.32662246, "learning_rate": 4.334e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20762, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "44m 56s", "remaining_time": "34m 30s"} +{"eval_loss": 0.30127066, "eval_token_acc": 0.75724276, "eval_runtime": 1.8237, "eval_samples_per_second": 2.193, "eval_steps_per_second": 2.193, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "44m 58s", "remaining_time": "34m 32s"} +{"loss": 0.14267081, "token_acc": 0.9234961, "grad_norm": 0.69588697, "learning_rate": 4.251e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207194, "epoch": 2.85353535, "global_step/max_steps": "565/990", "percentage": "57.07%", "elapsed_time": "45m 26s", "remaining_time": "34m 10s"} +{"loss": 0.1981081, "token_acc": 0.92469512, 
"grad_norm": 0.67560625, "learning_rate": 4.168e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206849, "epoch": 2.87878788, "global_step/max_steps": "570/990", "percentage": "57.58%", "elapsed_time": "45m 55s", "remaining_time": "33m 50s"} +{"loss": 0.10450697, "token_acc": 0.95728411, "grad_norm": 0.84433973, "learning_rate": 4.086e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206796, "epoch": 2.9040404, "global_step/max_steps": "575/990", "percentage": "58.08%", "elapsed_time": "46m 20s", "remaining_time": "33m 26s"} +{"loss": 0.15350983, "token_acc": 0.94093244, "grad_norm": 3.98355579, "learning_rate": 4.004e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207187, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "46m 39s", "remaining_time": "32m 58s"} +{"eval_loss": 0.30419731, "eval_token_acc": 0.75324675, "eval_runtime": 1.8171, "eval_samples_per_second": 2.201, "eval_steps_per_second": 2.201, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "46m 40s", "remaining_time": "32m 59s"} +{"loss": 0.14068719, "token_acc": 0.90724117, "grad_norm": 0.72939622, "learning_rate": 3.922e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207381, "epoch": 2.95454545, "global_step/max_steps": "585/990", "percentage": "59.09%", "elapsed_time": "47m 0s", "remaining_time": "32m 32s"} +{"loss": 0.13580102, "token_acc": 0.95163189, "grad_norm": 0.67629957, "learning_rate": 3.841e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206957, "epoch": 2.97979798, "global_step/max_steps": "590/990", "percentage": "59.60%", "elapsed_time": "47m 30s", "remaining_time": "32m 12s"} +{"loss": 0.12651438, "token_acc": 0.96316623, "grad_norm": 0.22191222, "learning_rate": 3.76e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206604, "epoch": 3.00505051, "global_step/max_steps": "595/990", "percentage": "60.10%", "elapsed_time": "47m 59s", "remaining_time": "31m 51s"} 
+{"loss": 0.05826302, "token_acc": 0.98315354, "grad_norm": 0.55185503, "learning_rate": 3.679e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206751, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "48m 21s", "remaining_time": "31m 26s"} +{"eval_loss": 0.30322751, "eval_token_acc": 0.75224775, "eval_runtime": 1.788, "eval_samples_per_second": 2.237, "eval_steps_per_second": 2.237, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "48m 23s", "remaining_time": "31m 27s"} +{"loss": 0.03953178, "token_acc": 0.95907578, "grad_norm": 0.61902952, "learning_rate": 3.599e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206057, "epoch": 3.05555556, "global_step/max_steps": "605/990", "percentage": "61.11%", "elapsed_time": "48m 55s", "remaining_time": "31m 8s"} +{"loss": 0.02162913, "token_acc": 0.9906428, "grad_norm": 0.25448671, "learning_rate": 3.519e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206434, "epoch": 3.08080808, "global_step/max_steps": "610/990", "percentage": "61.62%", "elapsed_time": "49m 14s", "remaining_time": "30m 40s"} +{"loss": 0.01217525, "token_acc": 0.99504281, "grad_norm": 0.06814919, "learning_rate": 3.439e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206853, "epoch": 3.10606061, "global_step/max_steps": "615/990", "percentage": "62.12%", "elapsed_time": "49m 32s", "remaining_time": "30m 12s"} +{"loss": 0.03373536, "token_acc": 0.98868266, "grad_norm": 0.41807011, "learning_rate": 3.36e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206723, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "49m 58s", "remaining_time": "29m 49s"} +{"eval_loss": 0.311775, "eval_token_acc": 0.75524476, "eval_runtime": 1.8259, "eval_samples_per_second": 2.191, "eval_steps_per_second": 2.191, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "50m 0s", 
"remaining_time": "29m 50s"} +{"loss": 0.03276803, "token_acc": 0.96537355, "grad_norm": 0.43156824, "learning_rate": 3.281e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206122, "epoch": 3.15656566, "global_step/max_steps": "625/990", "percentage": "63.13%", "elapsed_time": "50m 31s", "remaining_time": "29m 30s"} +{"loss": 0.03847094, "token_acc": 0.98682008, "grad_norm": 0.34091553, "learning_rate": 3.203e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206497, "epoch": 3.18181818, "global_step/max_steps": "630/990", "percentage": "63.64%", "elapsed_time": "50m 50s", "remaining_time": "29m 3s"} +{"loss": 0.07642164, "token_acc": 0.97017304, "grad_norm": 0.82255036, "learning_rate": 3.125e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206299, "epoch": 3.20707071, "global_step/max_steps": "635/990", "percentage": "64.14%", "elapsed_time": "51m 17s", "remaining_time": "28m 40s"} +{"loss": 0.06662977, "token_acc": 0.96961407, "grad_norm": 0.3956908, "learning_rate": 3.048e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205891, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "51m 48s", "remaining_time": "28m 19s"} +{"eval_loss": 0.30408666, "eval_token_acc": 0.75524476, "eval_runtime": 1.796, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "51m 49s", "remaining_time": "28m 20s"} +{"loss": 0.05522508, "token_acc": 0.95495415, "grad_norm": 0.51107395, "learning_rate": 2.971e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20523, "epoch": 3.25757576, "global_step/max_steps": "645/990", "percentage": "65.15%", "elapsed_time": "52m 22s", "remaining_time": "28m 0s"} +{"loss": 0.05822249, "token_acc": 0.98284064, "grad_norm": 0.30346569, "learning_rate": 2.895e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205468, "epoch": 3.28282828, "global_step/max_steps": "650/990", "percentage": 
"65.66%", "elapsed_time": "52m 43s", "remaining_time": "27m 34s"} +{"loss": 0.02679656, "token_acc": 0.99297095, "grad_norm": 0.39017254, "learning_rate": 2.82e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205582, "epoch": 3.30808081, "global_step/max_steps": "655/990", "percentage": "66.16%", "elapsed_time": "53m 5s", "remaining_time": "27m 9s"} +{"loss": 0.04556843, "token_acc": 0.98337292, "grad_norm": 0.28393775, "learning_rate": 2.745e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.2055, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "53m 31s", "remaining_time": "26m 45s"} +{"eval_loss": 0.30582681, "eval_token_acc": 0.75624376, "eval_runtime": 1.7988, "eval_samples_per_second": 2.224, "eval_steps_per_second": 2.224, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "53m 33s", "remaining_time": "26m 46s"} +{"loss": 0.04427232, "token_acc": 0.94053745, "grad_norm": 0.72597456, "learning_rate": 2.671e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205656, "epoch": 3.35858586, "global_step/max_steps": "665/990", "percentage": "67.17%", "elapsed_time": "53m 53s", "remaining_time": "26m 20s"} +{"loss": 0.04415865, "token_acc": 0.98261694, "grad_norm": 0.69115818, "learning_rate": 2.597e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205791, "epoch": 3.38383838, "global_step/max_steps": "670/990", "percentage": "67.68%", "elapsed_time": "54m 15s", "remaining_time": "25m 54s"} +{"loss": 0.03833087, "token_acc": 0.98438048, "grad_norm": 0.36632708, "learning_rate": 2.524e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205991, "epoch": 3.40909091, "global_step/max_steps": "675/990", "percentage": "68.18%", "elapsed_time": "54m 36s", "remaining_time": "25m 29s"} +{"loss": 0.05873996, "token_acc": 0.97440117, "grad_norm": 0.43370181, "learning_rate": 2.452e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206296, "epoch": 3.43434343, 
"global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "54m 55s", "remaining_time": "25m 2s"} +{"eval_loss": 0.30511844, "eval_token_acc": 0.75124875, "eval_runtime": 1.8159, "eval_samples_per_second": 2.203, "eval_steps_per_second": 2.203, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "54m 57s", "remaining_time": "25m 3s"} +{"loss": 0.07444832, "token_acc": 0.9341205, "grad_norm": 0.38646677, "learning_rate": 2.38e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205939, "epoch": 3.45959596, "global_step/max_steps": "685/990", "percentage": "69.19%", "elapsed_time": "55m 25s", "remaining_time": "24m 40s"} +{"loss": 0.05726301, "token_acc": 0.97225101, "grad_norm": 0.93468893, "learning_rate": 2.31e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205757, "epoch": 3.48484848, "global_step/max_steps": "690/990", "percentage": "69.70%", "elapsed_time": "55m 53s", "remaining_time": "24m 17s"} +{"loss": 0.05963586, "token_acc": 0.97833715, "grad_norm": 0.51441658, "learning_rate": 2.24e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205924, "epoch": 3.51010101, "global_step/max_steps": "695/990", "percentage": "70.20%", "elapsed_time": "56m 14s", "remaining_time": "23m 52s"} +{"loss": 0.00992876, "token_acc": 0.99569454, "grad_norm": 0.23240763, "learning_rate": 2.17e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206296, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "56m 32s", "remaining_time": "23m 25s"} +{"eval_loss": 0.30824292, "eval_token_acc": 0.74825175, "eval_runtime": 1.8346, "eval_samples_per_second": 2.18, "eval_steps_per_second": 2.18, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "56m 34s", "remaining_time": "23m 26s"} +{"loss": 0.03623521, "token_acc": 0.94073235, "grad_norm": 0.35060033, "learning_rate": 2.102e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 
0.206441, "epoch": 3.56060606, "global_step/max_steps": "705/990", "percentage": "71.21%", "elapsed_time": "56m 54s", "remaining_time": "23m 0s"} +{"loss": 0.03102828, "token_acc": 0.98930706, "grad_norm": 0.42759782, "learning_rate": 2.034e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206201, "epoch": 3.58585859, "global_step/max_steps": "710/990", "percentage": "71.72%", "elapsed_time": "57m 22s", "remaining_time": "22m 37s"} +{"loss": 0.00243716, "token_acc": 0.99869508, "grad_norm": 0.20535572, "learning_rate": 1.967e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206895, "epoch": 3.61111111, "global_step/max_steps": "715/990", "percentage": "72.22%", "elapsed_time": "57m 35s", "remaining_time": "22m 9s"} +{"loss": 0.0139591, "token_acc": 0.9963244, "grad_norm": 0.79736304, "learning_rate": 1.901e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206763, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "58m 1s", "remaining_time": "21m 45s"} +{"eval_loss": 0.3128719, "eval_token_acc": 0.74925075, "eval_runtime": 1.8131, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "58m 3s", "remaining_time": "21m 46s"} +{"loss": 0.03765462, "token_acc": 0.95605043, "grad_norm": 0.45141226, "learning_rate": 1.836e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206518, "epoch": 3.66161616, "global_step/max_steps": "725/990", "percentage": "73.23%", "elapsed_time": "58m 30s", "remaining_time": "21m 23s"} +{"loss": 0.05053359, "token_acc": 0.98093347, "grad_norm": 0.28638807, "learning_rate": 1.772e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206217, "epoch": 3.68686869, "global_step/max_steps": "730/990", "percentage": "73.74%", "elapsed_time": "58m 59s", "remaining_time": "21m 0s"} +{"loss": 0.03042393, "token_acc": 0.98569077, "grad_norm": 0.37708166, "learning_rate": 1.709e-05, 
"memory(GiB)": 194.69, "train_speed(iter/s)": 0.206105, "epoch": 3.71212121, "global_step/max_steps": "735/990", "percentage": "74.24%", "elapsed_time": "59m 25s", "remaining_time": "20m 37s"} +{"loss": 0.04238355, "token_acc": 0.98681621, "grad_norm": 0.56780881, "learning_rate": 1.646e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206402, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "59m 44s", "remaining_time": "20m 11s"} +{"eval_loss": 0.31193128, "eval_token_acc": 0.75624376, "eval_runtime": 1.8083, "eval_samples_per_second": 2.212, "eval_steps_per_second": 2.212, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "59m 46s", "remaining_time": "20m 11s"} +{"loss": 0.03465599, "token_acc": 0.92081633, "grad_norm": 0.344152, "learning_rate": 1.585e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206715, "epoch": 3.76262626, "global_step/max_steps": "745/990", "percentage": "75.25%", "elapsed_time": "1h 0m 3s", "remaining_time": "19m 45s"} +{"loss": 0.03131323, "token_acc": 0.99100379, "grad_norm": 0.78084624, "learning_rate": 1.524e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207112, "epoch": 3.78787879, "global_step/max_steps": "750/990", "percentage": "75.76%", "elapsed_time": "1h 0m 20s", "remaining_time": "19m 18s"} +{"loss": 0.10859463, "token_acc": 0.9532801, "grad_norm": 0.49152356, "learning_rate": 1.464e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206346, "epoch": 3.81313131, "global_step/max_steps": "755/990", "percentage": "76.26%", "elapsed_time": "1h 0m 58s", "remaining_time": "18m 58s"} +{"loss": 0.01086569, "token_acc": 0.99732763, "grad_norm": 0.36231735, "learning_rate": 1.406e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206831, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "1h 1m 14s", "remaining_time": "18m 31s"} +{"eval_loss": 0.30831078, "eval_token_acc": 
0.75424575, "eval_runtime": 1.7998, "eval_samples_per_second": 2.222, "eval_steps_per_second": 2.222, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "1h 1m 15s", "remaining_time": "18m 32s"} +{"loss": 0.05528296, "token_acc": 0.94007663, "grad_norm": 0.46308312, "learning_rate": 1.348e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206812, "epoch": 3.86363636, "global_step/max_steps": "765/990", "percentage": "77.27%", "elapsed_time": "1h 1m 38s", "remaining_time": "18m 7s"} +{"loss": 0.02886441, "token_acc": 0.9896856, "grad_norm": 0.10027549, "learning_rate": 1.292e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206816, "epoch": 3.88888889, "global_step/max_steps": "770/990", "percentage": "77.78%", "elapsed_time": "1h 2m 2s", "remaining_time": "17m 43s"} +{"loss": 0.03930848, "token_acc": 0.9889024, "grad_norm": 0.4784112, "learning_rate": 1.236e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206798, "epoch": 3.91414141, "global_step/max_steps": "775/990", "percentage": "78.28%", "elapsed_time": "1h 2m 27s", "remaining_time": "17m 19s"} +{"loss": 0.04449717, "token_acc": 0.98045007, "grad_norm": 0.45229685, "learning_rate": 1.182e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206775, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "1h 2m 51s", "remaining_time": "16m 55s"} +{"eval_loss": 0.30842295, "eval_token_acc": 0.75324675, "eval_runtime": 1.8372, "eval_samples_per_second": 2.177, "eval_steps_per_second": 2.177, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "1h 2m 53s", "remaining_time": "16m 55s"} +{"loss": 0.03509678, "token_acc": 0.93630573, "grad_norm": 1.16340601, "learning_rate": 1.128e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206843, "epoch": 3.96464646, "global_step/max_steps": "785/990", "percentage": "79.29%", "elapsed_time": "1h 3m 14s", "remaining_time": "16m 30s"} 
+{"loss": 0.0488777, "token_acc": 0.98474141, "grad_norm": 0.57494467, "learning_rate": 1.076e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207005, "epoch": 3.98989899, "global_step/max_steps": "790/990", "percentage": "79.80%", "elapsed_time": "1h 3m 35s", "remaining_time": "16m 6s"} +{"loss": 0.02856588, "token_acc": 0.99008476, "grad_norm": 0.25825092, "learning_rate": 1.025e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20717, "epoch": 4.01515152, "global_step/max_steps": "795/990", "percentage": "80.30%", "elapsed_time": "1h 3m 57s", "remaining_time": "15m 41s"} +{"loss": 0.00884538, "token_acc": 0.99835549, "grad_norm": 0.34034023, "learning_rate": 9.75e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207207, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "1h 4m 20s", "remaining_time": "15m 16s"} +{"eval_loss": 0.30910566, "eval_token_acc": 0.74925075, "eval_runtime": 1.7909, "eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "1h 4m 22s", "remaining_time": "15m 17s"} +{"loss": 0.01580266, "token_acc": 0.96340232, "grad_norm": 0.36784557, "learning_rate": 9.26e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206982, "epoch": 4.06565657, "global_step/max_steps": "805/990", "percentage": "81.31%", "elapsed_time": "1h 4m 48s", "remaining_time": "14m 53s"} +{"loss": 0.00252092, "token_acc": 0.99980357, "grad_norm": 0.02392689, "learning_rate": 8.78e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207283, "epoch": 4.09090909, "global_step/max_steps": "810/990", "percentage": "81.82%", "elapsed_time": "1h 5m 7s", "remaining_time": "14m 28s"} +{"loss": 0.01393091, "token_acc": 0.99452804, "grad_norm": 0.29595932, "learning_rate": 8.31e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207408, "epoch": 4.11616162, "global_step/max_steps": "815/990", "percentage": "82.32%", 
"elapsed_time": "1h 5m 29s", "remaining_time": "14m 3s"} +{"loss": 0.0075201, "token_acc": 0.99843896, "grad_norm": 0.03354532, "learning_rate": 7.86e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207412, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "1h 5m 53s", "remaining_time": "13m 39s"} +{"eval_loss": 0.31242928, "eval_token_acc": 0.74925075, "eval_runtime": 1.8138, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "1h 5m 54s", "remaining_time": "13m 39s"} +{"loss": 0.02059356, "token_acc": 0.96786716, "grad_norm": 0.24018376, "learning_rate": 7.41e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207046, "epoch": 4.16666667, "global_step/max_steps": "825/990", "percentage": "83.33%", "elapsed_time": "1h 6m 24s", "remaining_time": "13m 16s"} +{"loss": 0.05117875, "token_acc": 0.96091348, "grad_norm": 0.33094808, "learning_rate": 6.98e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207063, "epoch": 4.19191919, "global_step/max_steps": "830/990", "percentage": "83.84%", "elapsed_time": "1h 6m 48s", "remaining_time": "12m 52s"} +{"loss": 0.02565375, "token_acc": 0.99176001, "grad_norm": 0.37649092, "learning_rate": 6.56e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207066, "epoch": 4.21717172, "global_step/max_steps": "835/990", "percentage": "84.34%", "elapsed_time": "1h 7m 12s", "remaining_time": "12m 28s"} +{"loss": 0.0273732, "token_acc": 0.98576512, "grad_norm": 0.34800857, "learning_rate": 6.15e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206927, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "1h 7m 39s", "remaining_time": "12m 4s"} +{"eval_loss": 0.31844291, "eval_token_acc": 0.74825175, "eval_runtime": 1.7906, "eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 4.24242424, 
"global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "1h 7m 40s", "remaining_time": "12m 5s"} +{"loss": 0.06719567, "token_acc": 0.95021459, "grad_norm": 0.72287565, "learning_rate": 5.76e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206416, "epoch": 4.26767677, "global_step/max_steps": "845/990", "percentage": "85.35%", "elapsed_time": "1h 8m 13s", "remaining_time": "11m 42s"} +{"loss": 0.04835308, "token_acc": 0.98187926, "grad_norm": 0.38411421, "learning_rate": 5.37e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206169, "epoch": 4.29292929, "global_step/max_steps": "850/990", "percentage": "85.86%", "elapsed_time": "1h 8m 42s", "remaining_time": "11m 18s"} +{"loss": 0.01264832, "token_acc": 0.99563223, "grad_norm": 0.10299125, "learning_rate": 5e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20603, "epoch": 4.31818182, "global_step/max_steps": "855/990", "percentage": "86.36%", "elapsed_time": "1h 9m 9s", "remaining_time": "10m 55s"} +{"loss": 0.01213498, "token_acc": 0.99677852, "grad_norm": 0.24367812, "learning_rate": 4.65e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205974, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "1h 9m 34s", "remaining_time": "10m 31s"} +{"eval_loss": 0.32311469, "eval_token_acc": 0.74925075, "eval_runtime": 1.7858, "eval_samples_per_second": 2.24, "eval_steps_per_second": 2.24, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "1h 9m 36s", "remaining_time": "10m 31s"} +{"loss": 0.01014752, "token_acc": 0.96458753, "grad_norm": 0.18834779, "learning_rate": 4.3e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20586, "epoch": 4.36868687, "global_step/max_steps": "865/990", "percentage": "87.37%", "elapsed_time": "1h 10m 1s", "remaining_time": "10m 7s"} +{"loss": 0.00840833, "token_acc": 0.99818302, "grad_norm": 0.1570287, "learning_rate": 3.97e-06, "memory(GiB)": 194.69, 
"train_speed(iter/s)": 0.206013, "epoch": 4.39393939, "global_step/max_steps": "870/990", "percentage": "87.88%", "elapsed_time": "1h 10m 22s", "remaining_time": "9m 42s"} +{"loss": 0.00608762, "token_acc": 0.99690582, "grad_norm": 0.15527098, "learning_rate": 3.65e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206279, "epoch": 4.41919192, "global_step/max_steps": "875/990", "percentage": "88.38%", "elapsed_time": "1h 10m 41s", "remaining_time": "9m 17s"} +{"loss": 0.01747374, "token_acc": 0.99469777, "grad_norm": 0.49585438, "learning_rate": 3.34e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206368, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "1h 11m 3s", "remaining_time": "8m 52s"} +{"eval_loss": 0.32524875, "eval_token_acc": 0.74825175, "eval_runtime": 1.8805, "eval_samples_per_second": 2.127, "eval_steps_per_second": 2.127, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "1h 11m 5s", "remaining_time": "8m 53s"} +{"loss": 0.00792774, "token_acc": 0.96592954, "grad_norm": 0.23304193, "learning_rate": 3.05e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206247, "epoch": 4.46969697, "global_step/max_steps": "885/990", "percentage": "89.39%", "elapsed_time": "1h 11m 30s", "remaining_time": "8m 29s"} +{"loss": 0.01131476, "token_acc": 0.99672578, "grad_norm": 0.13684528, "learning_rate": 2.77e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206195, "epoch": 4.49494949, "global_step/max_steps": "890/990", "percentage": "89.90%", "elapsed_time": "1h 11m 55s", "remaining_time": "8m 4s"} +{"loss": 0.0211871, "token_acc": 0.99424697, "grad_norm": 0.33595765, "learning_rate": 2.5e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206448, "epoch": 4.52020202, "global_step/max_steps": "895/990", "percentage": "90.40%", "elapsed_time": "1h 12m 14s", "remaining_time": "7m 40s"} +{"loss": 0.03609364, "token_acc": 0.98884758, "grad_norm": 0.28014275, 
"learning_rate": 2.24e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206265, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "1h 12m 42s", "remaining_time": "7m 16s"} +{"eval_loss": 0.32550812, "eval_token_acc": 0.74725275, "eval_runtime": 1.8451, "eval_samples_per_second": 2.168, "eval_steps_per_second": 2.168, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "1h 12m 44s", "remaining_time": "7m 16s"} +{"loss": 0.01140303, "token_acc": 0.95904598, "grad_norm": 1.37669158, "learning_rate": 2e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206299, "epoch": 4.57070707, "global_step/max_steps": "905/990", "percentage": "91.41%", "elapsed_time": "1h 13m 6s", "remaining_time": "6m 51s"} +{"loss": 0.00628884, "token_acc": 0.99844277, "grad_norm": 0.06532011, "learning_rate": 1.78e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206688, "epoch": 4.5959596, "global_step/max_steps": "910/990", "percentage": "91.92%", "elapsed_time": "1h 13m 22s", "remaining_time": "6m 27s"} +{"loss": 0.01749746, "token_acc": 0.99283864, "grad_norm": 0.02101526, "learning_rate": 1.56e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206791, "epoch": 4.62121212, "global_step/max_steps": "915/990", "percentage": "92.42%", "elapsed_time": "1h 13m 44s", "remaining_time": "6m 2s"} +{"loss": 0.00437452, "token_acc": 0.99805532, "grad_norm": 0.11464821, "learning_rate": 1.36e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207068, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "1h 14m 2s", "remaining_time": "5m 38s"} +{"eval_loss": 0.32665226, "eval_token_acc": 0.74925075, "eval_runtime": 1.808, "eval_samples_per_second": 2.212, "eval_steps_per_second": 2.212, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "1h 14m 4s", "remaining_time": "5m 38s"} +{"loss": 0.00326997, "token_acc": 
0.95751364, "grad_norm": 0.42058226, "learning_rate": 1.18e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207081, "epoch": 4.67171717, "global_step/max_steps": "925/990", "percentage": "93.43%", "elapsed_time": "1h 14m 26s", "remaining_time": "5m 13s"} +{"loss": 0.01534834, "token_acc": 0.99303898, "grad_norm": 0.00893018, "learning_rate": 1e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207309, "epoch": 4.6969697, "global_step/max_steps": "930/990", "percentage": "93.94%", "elapsed_time": "1h 14m 45s", "remaining_time": "4m 49s"} +{"loss": 0.02128749, "token_acc": 0.99040459, "grad_norm": 0.4243255, "learning_rate": 8.4e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207003, "epoch": 4.72222222, "global_step/max_steps": "935/990", "percentage": "94.44%", "elapsed_time": "1h 15m 16s", "remaining_time": "4m 25s"} +{"loss": 0.01530021, "token_acc": 0.99384667, "grad_norm": 0.09640122, "learning_rate": 7e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207138, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "1h 15m 37s", "remaining_time": "4m 1s"} +{"eval_loss": 0.32676643, "eval_token_acc": 0.75024975, "eval_runtime": 1.8049, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "1h 15m 39s", "remaining_time": "4m 1s"} +{"loss": 0.01212942, "token_acc": 0.97295147, "grad_norm": 0.09156661, "learning_rate": 5.6e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206551, "epoch": 4.77272727, "global_step/max_steps": "945/990", "percentage": "95.45%", "elapsed_time": "1h 16m 14s", "remaining_time": "3m 37s"} +{"loss": 0.00970393, "token_acc": 0.99487097, "grad_norm": 0.01458467, "learning_rate": 4.5e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206615, "epoch": 4.7979798, "global_step/max_steps": "950/990", "percentage": "95.96%", "elapsed_time": "1h 16m 37s", "remaining_time": "3m 
13s"} +{"loss": 0.00385862, "token_acc": 0.99880668, "grad_norm": 0.09790432, "learning_rate": 3.4e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206618, "epoch": 4.82323232, "global_step/max_steps": "955/990", "percentage": "96.46%", "elapsed_time": "1h 17m 1s", "remaining_time": "2m 49s"} +{"loss": 0.02363491, "token_acc": 0.99180203, "grad_norm": 0.36749953, "learning_rate": 2.5e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206703, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "1h 17m 23s", "remaining_time": "2m 25s"} +{"eval_loss": 0.32730395, "eval_token_acc": 0.74825175, "eval_runtime": 1.8393, "eval_samples_per_second": 2.175, "eval_steps_per_second": 2.175, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "1h 17m 25s", "remaining_time": "2m 25s"} +{"loss": 0.01439833, "token_acc": 0.95524566, "grad_norm": 0.21629047, "learning_rate": 1.7e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206714, "epoch": 4.87373737, "global_step/max_steps": "965/990", "percentage": "97.47%", "elapsed_time": "1h 17m 47s", "remaining_time": "2m 0s"} +{"loss": 0.00892629, "token_acc": 0.99749541, "grad_norm": 0.26322067, "learning_rate": 1.1e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206903, "epoch": 4.8989899, "global_step/max_steps": "970/990", "percentage": "97.98%", "elapsed_time": "1h 18m 7s", "remaining_time": "1m 36s"} +{"loss": 0.00672083, "token_acc": 0.99857839, "grad_norm": 0.14011167, "learning_rate": 6e-08, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207185, "epoch": 4.92424242, "global_step/max_steps": "975/990", "percentage": "98.48%", "elapsed_time": "1h 18m 25s", "remaining_time": "1m 12s"} +{"loss": 0.03538868, "token_acc": 0.98341269, "grad_norm": 0.07906041, "learning_rate": 3e-08, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207226, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": 
"1h 18m 48s", "remaining_time": "48s"} +{"eval_loss": 0.32695881, "eval_token_acc": 0.74725275, "eval_runtime": 1.7898, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "1h 18m 50s", "remaining_time": "48s"} +{"loss": 0.06200398, "token_acc": 0.95089808, "grad_norm": 0.3005389, "learning_rate": 1e-08, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206925, "epoch": 4.97474747, "global_step/max_steps": "985/990", "percentage": "99.49%", "elapsed_time": "1h 19m 19s", "remaining_time": "24s"} +{"loss": 0.00415896, "token_acc": 0.99902692, "grad_norm": 0.19153616, "learning_rate": 0.0, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207036, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 41s", "remaining_time": "0s"} +{"eval_loss": 0.32688043, "eval_token_acc": 0.74725275, "eval_runtime": 1.8656, "eval_samples_per_second": 2.144, "eval_steps_per_second": 2.144, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 43s", "remaining_time": "0s"} +{"train_runtime": 4786.0578, "train_samples_per_second": 0.414, "train_steps_per_second": 0.207, "total_flos": 6.384415932845261e+17, "train_loss": 0.19521469, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 46s", "remaining_time": "0s"} +{"train_dataset": "770.919192±617.756509, min=58.000000, max=4021.000000, size=396", "val_dataset": "321.000000±303.974506, min=102.000000, max=841.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 70760.8003M Params (207.0938M Trainable [0.2927%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-990", "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/checkpoint-420", "best_metric": 0.29718354, "global_step": 990, "log_history": [{"loss": 0.569739043712616, "token_acc": 0.8513761467889909, "grad_norm": 0.5129857659339905, "learning_rate": 2.0000000000000003e-06, "memory(GiB)": 144.03, "train_speed(iter/s)": 0.135022, "epoch": 0.005050505050505051, "step": 1}, {"loss": 0.7468794584274292, "token_acc": 0.8297613248904043, "grad_norm": 0.8691070675849915, "learning_rate": 1e-05, "memory(GiB)": 153.42, "train_speed(iter/s)": 0.212168, "epoch": 0.025252525252525252, "step": 5}, {"loss": 0.7946175098419189, "token_acc": 0.787320071162866, "grad_norm": 0.5965867638587952, "learning_rate": 2e-05, "memory(GiB)": 160.41, "train_speed(iter/s)": 0.221849, "epoch": 0.050505050505050504, "step": 10}, {"loss": 0.7008682727813721, "token_acc": 0.8016944665078104, "grad_norm": 0.548611044883728, "learning_rate": 3e-05, "memory(GiB)": 170.14, "train_speed(iter/s)": 0.218317, "epoch": 0.07575757575757576, "step": 15}, {"loss": 0.5246500968933105, "token_acc": 0.8706467661691543, "grad_norm": 4.018751621246338, "learning_rate": 4e-05, "memory(GiB)": 170.14, "train_speed(iter/s)": 0.236511, "epoch": 0.10101010101010101, "step": 20}, {"eval_loss": 0.7290887236595154, "eval_token_acc": 0.7212787212787213, "eval_runtime": 1.8127, "eval_samples_per_second": 2.207, "eval_steps_per_second": 2.207, "epoch": 0.10101010101010101, "step": 20}, {"loss": 0.48354249000549315, "token_acc": 0.8261477045908183, "grad_norm": 0.45920249819755554, "learning_rate": 5e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.214846, "epoch": 0.12626262626262627, "step": 25}, {"loss": 0.5703897476196289, "token_acc": 0.8114154296466652, "grad_norm": 1.1457232236862183, "learning_rate": 6e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.223062, "epoch": 0.15151515151515152, "step": 30}, {"loss": 0.3709995269775391, "token_acc": 
0.8511754068716094, "grad_norm": 0.3219285309314728, "learning_rate": 7e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.226934, "epoch": 0.17676767676767677, "step": 35}, {"loss": 0.44092235565185545, "token_acc": 0.8480160435467698, "grad_norm": 0.2930394113063812, "learning_rate": 8e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.226914, "epoch": 0.20202020202020202, "step": 40}, {"eval_loss": 0.5398522615432739, "eval_token_acc": 0.7362637362637363, "eval_runtime": 1.8136, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "epoch": 0.20202020202020202, "step": 40}, {"loss": 0.4420435428619385, "token_acc": 0.83846547314578, "grad_norm": 0.4423171281814575, "learning_rate": 9e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.218055, "epoch": 0.22727272727272727, "step": 45}, {"loss": 0.5247397899627686, "token_acc": 0.8684412312410998, "grad_norm": 0.356607049703598, "learning_rate": 0.0001, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.213301, "epoch": 0.25252525252525254, "step": 50}, {"loss": 0.46730861663818357, "token_acc": 0.8472682119205298, "grad_norm": 0.41687265038490295, "learning_rate": 9.999301905929286e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.21328, "epoch": 0.2777777777777778, "step": 55}, {"loss": 0.36838181018829347, "token_acc": 0.8551724137931035, "grad_norm": 0.7148261666297913, "learning_rate": 9.997207818651274e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.218719, "epoch": 0.30303030303030304, "step": 60}, {"eval_loss": 0.5127567052841187, "eval_token_acc": 0.7422577422577422, "eval_runtime": 1.8179, "eval_samples_per_second": 2.2, "eval_steps_per_second": 2.2, "epoch": 0.30303030303030304, "step": 60}, {"loss": 0.5411728858947754, "token_acc": 0.8298285714285715, "grad_norm": 0.6570205688476562, "learning_rate": 9.99371832291393e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.213853, "epoch": 0.3282828282828283, "step": 65}, {"loss": 0.39851596355438235, "token_acc": 0.8732449297971919, 
"grad_norm": 0.2901982367038727, "learning_rate": 9.988834393115767e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.211799, "epoch": 0.35353535353535354, "step": 70}, {"loss": 0.5697728157043457, "token_acc": 0.8354898336414048, "grad_norm": 0.31978854537010193, "learning_rate": 9.982557393033758e-05, "memory(GiB)": 179.18, "train_speed(iter/s)": 0.215997, "epoch": 0.3787878787878788, "step": 75}, {"loss": 0.6011258602142334, "token_acc": 0.8381935097951249, "grad_norm": 0.3901304304599762, "learning_rate": 9.974889075442521e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.216474, "epoch": 0.40404040404040403, "step": 80}, {"eval_loss": 0.5226491093635559, "eval_token_acc": 0.7392607392607392, "eval_runtime": 1.8268, "eval_samples_per_second": 2.19, "eval_steps_per_second": 2.19, "epoch": 0.40404040404040403, "step": 80}, {"loss": 0.5449016571044922, "token_acc": 0.8128, "grad_norm": 0.27575281262397766, "learning_rate": 9.965831581624871e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.213104, "epoch": 0.4292929292929293, "step": 85}, {"loss": 0.4397461414337158, "token_acc": 0.8472657610588645, "grad_norm": 0.6381244659423828, "learning_rate": 9.9553874407739e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.214392, "epoch": 0.45454545454545453, "step": 90}, {"loss": 0.3532680034637451, "token_acc": 0.8662573411639082, "grad_norm": 0.3371107280254364, "learning_rate": 9.94355956928673e-05, "memory(GiB)": 179.25, "train_speed(iter/s)": 0.21374, "epoch": 0.4797979797979798, "step": 95}, {"loss": 0.41753764152526857, "token_acc": 0.8640469738030714, "grad_norm": 0.3131839334964752, "learning_rate": 9.930351269950143e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.211078, "epoch": 0.5050505050505051, "step": 100}, {"eval_loss": 0.5102224349975586, "eval_token_acc": 0.7382617382617382, "eval_runtime": 1.8327, "eval_samples_per_second": 2.183, "eval_steps_per_second": 2.183, "epoch": 0.5050505050505051, "step": 100}, {"loss": 0.5459073543548584, 
"token_acc": 0.8039112050739958, "grad_norm": 0.39832383394241333, "learning_rate": 9.915766231018318e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.210332, "epoch": 0.5303030303030303, "step": 105}, {"loss": 0.5121739864349365, "token_acc": 0.8097281831187411, "grad_norm": 1.300778865814209, "learning_rate": 9.899808525182935e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.214826, "epoch": 0.5555555555555556, "step": 110}, {"loss": 0.451249361038208, "token_acc": 0.8614628614628614, "grad_norm": 0.34517738223075867, "learning_rate": 9.882482608435923e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.211594, "epoch": 0.5808080808080808, "step": 115}, {"loss": 0.5085325717926026, "token_acc": 0.8130574826560951, "grad_norm": 0.7278887033462524, "learning_rate": 9.863793318825186e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.21067, "epoch": 0.6060606060606061, "step": 120}, {"eval_loss": 0.509061336517334, "eval_token_acc": 0.7382617382617382, "eval_runtime": 1.7896, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 0.6060606060606061, "step": 120}, {"loss": 0.4689138889312744, "token_acc": 0.8352281825460368, "grad_norm": 0.38017159700393677, "learning_rate": 9.843745875103627e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.210631, "epoch": 0.6313131313131313, "step": 125}, {"loss": 0.4758878707885742, "token_acc": 0.8449714013346044, "grad_norm": 0.40452879667282104, "learning_rate": 9.822345875271883e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.209823, "epoch": 0.6565656565656566, "step": 130}, {"loss": 0.3720943212509155, "token_acc": 0.8728339854667412, "grad_norm": 0.41681820154190063, "learning_rate": 9.799599295015154e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.209914, "epoch": 0.6818181818181818, "step": 135}, {"loss": 0.5305635452270507, "token_acc": 0.8278240499739719, "grad_norm": 0.43215978145599365, "learning_rate": 9.775512486034563e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 
0.20961, "epoch": 0.7070707070707071, "step": 140}, {"eval_loss": 0.4448011815547943, "eval_token_acc": 0.7502497502497503, "eval_runtime": 1.8087, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 0.7070707070707071, "step": 140}, {"loss": 0.34435036182403567, "token_acc": 0.8617477760334903, "grad_norm": 0.5140364766120911, "learning_rate": 9.750092174273521e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.207394, "epoch": 0.7323232323232324, "step": 145}, {"loss": 0.4015669345855713, "token_acc": 0.8720949673967564, "grad_norm": 0.8977436423301697, "learning_rate": 9.723345458039594e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20834, "epoch": 0.7575757575757576, "step": 150}, {"loss": 0.4199058055877686, "token_acc": 0.8565537923278771, "grad_norm": 0.6019405126571655, "learning_rate": 9.69527980602239e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20801, "epoch": 0.7828282828282829, "step": 155}, {"loss": 0.34770309925079346, "token_acc": 0.8795408083031924, "grad_norm": 0.41383394598960876, "learning_rate": 9.665903055208014e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208783, "epoch": 0.8080808080808081, "step": 160}, {"eval_loss": 0.4233054220676422, "eval_token_acc": 0.7442557442557443, "eval_runtime": 1.8491, "eval_samples_per_second": 2.163, "eval_steps_per_second": 2.163, "epoch": 0.8080808080808081, "step": 160}, {"loss": 0.4095714569091797, "token_acc": 0.8456421395601412, "grad_norm": 0.2816776931285858, "learning_rate": 9.635223408690688e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20789, "epoch": 0.8333333333333334, "step": 165}, {"loss": 0.45749435424804685, "token_acc": 0.8532716457369465, "grad_norm": 0.41569507122039795, "learning_rate": 9.603249433382144e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20746, "epoch": 0.8585858585858586, "step": 170}, {"loss": 0.4164144515991211, "token_acc": 0.8524394404640054, "grad_norm": 0.4932589828968048, "learning_rate": 
9.569990057619414e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208373, "epoch": 0.8838383838383839, "step": 175}, {"loss": 0.422211742401123, "token_acc": 0.8620525059665871, "grad_norm": 0.3632556200027466, "learning_rate": 9.535454568671704e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208791, "epoch": 0.9090909090909091, "step": 180}, {"eval_loss": 0.45568227767944336, "eval_token_acc": 0.7582417582417582, "eval_runtime": 1.8409, "eval_samples_per_second": 2.173, "eval_steps_per_second": 2.173, "epoch": 0.9090909090909091, "step": 180}, {"loss": 0.49337053298950195, "token_acc": 0.8209519012843113, "grad_norm": 0.4775158166885376, "learning_rate": 9.49965261014704e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.207976, "epoch": 0.9343434343434344, "step": 185}, {"loss": 0.7271251678466797, "token_acc": 0.7965624119470274, "grad_norm": 2.7428700923919678, "learning_rate": 9.462594179299406e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.209678, "epoch": 0.9595959595959596, "step": 190}, {"loss": 0.5494725227355957, "token_acc": 0.8197339246119734, "grad_norm": 0.46910813450813293, "learning_rate": 9.424289624237144e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.208894, "epoch": 0.9848484848484849, "step": 195}, {"loss": 0.4731719970703125, "token_acc": 0.8567099230709457, "grad_norm": 0.2979983389377594, "learning_rate": 9.384749641033359e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20626, "epoch": 1.0101010101010102, "step": 200}, {"eval_loss": 0.499001145362854, "eval_token_acc": 0.7502497502497503, "eval_runtime": 1.8407, "eval_samples_per_second": 2.173, "eval_steps_per_second": 2.173, "epoch": 1.0101010101010102, "step": 200}, {"loss": 0.42464404106140136, "token_acc": 0.8373205741626795, "grad_norm": 0.38004475831985474, "learning_rate": 9.343985270739182e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203763, "epoch": 1.0353535353535352, "step": 205}, {"loss": 0.3545402765274048, "token_acc": 0.8774455518641565, 
"grad_norm": 0.4347917437553406, "learning_rate": 9.302007896300698e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203468, "epoch": 1.0606060606060606, "step": 210}, {"loss": 0.32962794303894044, "token_acc": 0.8891755236817666, "grad_norm": 0.38388073444366455, "learning_rate": 9.25882923938038e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202581, "epoch": 1.0858585858585859, "step": 215}, {"loss": 0.2972090482711792, "token_acc": 0.9002027809965237, "grad_norm": 0.5489069223403931, "learning_rate": 9.214461357083985e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202983, "epoch": 1.1111111111111112, "step": 220}, {"eval_loss": 0.5020791292190552, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.8362, "eval_samples_per_second": 2.178, "eval_steps_per_second": 2.178, "epoch": 1.1111111111111112, "step": 220}, {"loss": 0.42758522033691404, "token_acc": 0.8501907293954456, "grad_norm": 0.602890133857727, "learning_rate": 9.168916638593736e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202184, "epoch": 1.1363636363636362, "step": 225}, {"loss": 0.36005940437316897, "token_acc": 0.8754238800642513, "grad_norm": 0.39421549439430237, "learning_rate": 9.122207801708802e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.200825, "epoch": 1.1616161616161615, "step": 230}, {"loss": 0.15720741748809813, "token_acc": 0.9282419272168119, "grad_norm": 0.46965521574020386, "learning_rate": 9.074347889294016e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202308, "epoch": 1.1868686868686869, "step": 235}, {"loss": 0.3855461120605469, "token_acc": 0.8711264141662568, "grad_norm": 0.7477562427520752, "learning_rate": 9.025350265637815e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202911, "epoch": 1.2121212121212122, "step": 240}, {"eval_loss": 0.5334002375602722, "eval_token_acc": 0.7442557442557443, "eval_runtime": 1.8268, "eval_samples_per_second": 2.19, "eval_steps_per_second": 2.19, "epoch": 1.2121212121212122, "step": 240}, {"loss": 
0.24191198348999024, "token_acc": 0.8654323028599769, "grad_norm": 0.5034751892089844, "learning_rate": 8.975228612720416e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203143, "epoch": 1.2373737373737375, "step": 245}, {"loss": 0.3798638105392456, "token_acc": 0.8645948945615982, "grad_norm": 0.7961943745613098, "learning_rate": 8.923996926393305e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.204558, "epoch": 1.2626262626262625, "step": 250}, {"loss": 0.3437025547027588, "token_acc": 0.87003341997772, "grad_norm": 0.4535170793533325, "learning_rate": 8.871669512471068e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203403, "epoch": 1.2878787878787878, "step": 255}, {"loss": 0.3248720645904541, "token_acc": 0.8757570513929746, "grad_norm": 0.7094895243644714, "learning_rate": 8.818260982736661e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.204064, "epoch": 1.3131313131313131, "step": 260}, {"eval_loss": 0.4627455472946167, "eval_token_acc": 0.7502497502497503, "eval_runtime": 1.8037, "eval_samples_per_second": 2.218, "eval_steps_per_second": 2.218, "epoch": 1.3131313131313131, "step": 260}, {"loss": 0.2554438352584839, "token_acc": 0.9007541995200549, "grad_norm": 0.22084304690361023, "learning_rate": 8.763786250861256e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.201941, "epoch": 1.3383838383838385, "step": 265}, {"loss": 0.23369157314300537, "token_acc": 0.910726525017135, "grad_norm": 0.44375061988830566, "learning_rate": 8.708260528239788e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.202564, "epoch": 1.3636363636363638, "step": 270}, {"loss": 0.2531747817993164, "token_acc": 0.9063345966432051, "grad_norm": 0.42193475365638733, "learning_rate": 8.651699319743347e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20219, "epoch": 1.3888888888888888, "step": 275}, {"loss": 0.37001192569732666, "token_acc": 0.8865761157170576, "grad_norm": 0.8283807635307312, "learning_rate": 8.594118419389647e-05, "memory(GiB)": 194.67, 
"train_speed(iter/s)": 0.20285, "epoch": 1.4141414141414141, "step": 280}, {"eval_loss": 0.427340030670166, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.8284, "eval_samples_per_second": 2.188, "eval_steps_per_second": 2.188, "epoch": 1.4141414141414141, "step": 280}, {"loss": 0.18019050359725952, "token_acc": 0.9067105947633085, "grad_norm": 0.7095621824264526, "learning_rate": 8.535533905932738e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203001, "epoch": 1.4393939393939394, "step": 285}, {"loss": 0.33577933311462405, "token_acc": 0.8656873032528857, "grad_norm": 0.7720094323158264, "learning_rate": 8.475962138373213e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.203902, "epoch": 1.4646464646464645, "step": 290}, {"loss": 0.3461789131164551, "token_acc": 0.8515314472761282, "grad_norm": 1.2524478435516357, "learning_rate": 8.415419751390155e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.204926, "epoch": 1.4898989898989898, "step": 295}, {"loss": 0.28853349685668944, "token_acc": 0.8904656319290466, "grad_norm": 2.095057725906372, "learning_rate": 8.353923650696118e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.205093, "epoch": 1.5151515151515151, "step": 300}, {"eval_loss": 0.43511924147605896, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.8124, "eval_samples_per_second": 2.207, "eval_steps_per_second": 2.207, "epoch": 1.5151515151515151, "step": 300}, {"loss": 0.3344187498092651, "token_acc": 0.8720779866706456, "grad_norm": 0.516942024230957, "learning_rate": 8.291491008316409e-05, "memory(GiB)": 194.67, "train_speed(iter/s)": 0.20388, "epoch": 1.5404040404040404, "step": 305}, {"loss": 0.2534762382507324, "token_acc": 0.9083986562150056, "grad_norm": 0.5044359564781189, "learning_rate": 8.228139257794012e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20477, "epoch": 1.5656565656565657, "step": 310}, {"loss": 0.23027021884918214, "token_acc": 0.9053683385579937, "grad_norm": 1.4840720891952515, 
"learning_rate": 8.163886089321493e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205493, "epoch": 1.5909090909090908, "step": 315}, {"loss": 0.2922214031219482, "token_acc": 0.9086255041886441, "grad_norm": 0.47270119190216064, "learning_rate": 8.098749444801224e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205726, "epoch": 1.6161616161616161, "step": 320}, {"eval_loss": 0.375497043132782, "eval_token_acc": 0.7572427572427572, "eval_runtime": 1.7963, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 1.6161616161616161, "step": 320}, {"loss": 0.2936864376068115, "token_acc": 0.8697649283977303, "grad_norm": 0.5327674746513367, "learning_rate": 8.032747512835337e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.205415, "epoch": 1.6414141414141414, "step": 325}, {"loss": 0.3568688154220581, "token_acc": 0.8953463435556509, "grad_norm": 0.7329817414283752, "learning_rate": 7.965898723646776e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20625, "epoch": 1.6666666666666665, "step": 330}, {"loss": 0.28799734115600584, "token_acc": 0.8915866741953699, "grad_norm": 0.46463268995285034, "learning_rate": 7.898221743932888e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206245, "epoch": 1.691919191919192, "step": 335}, {"loss": 0.19362801313400269, "token_acc": 0.9217057761732852, "grad_norm": 0.9827662110328674, "learning_rate": 7.829735471652978e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207024, "epoch": 1.7171717171717171, "step": 340}, {"eval_loss": 0.36126405000686646, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.7972, "eval_samples_per_second": 2.226, "eval_steps_per_second": 2.226, "epoch": 1.7171717171717171, "step": 340}, {"loss": 0.24427511692047119, "token_acc": 0.9034146341463415, "grad_norm": 0.5373030304908752, "learning_rate": 7.760459030751284e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20636, "epoch": 1.7424242424242424, "step": 345}, {"loss": 0.1736771821975708, "token_acc": 
0.9324116743471582, "grad_norm": 1.4386836290359497, "learning_rate": 7.690411765816864e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207166, "epoch": 1.7676767676767677, "step": 350}, {"loss": 0.3541959285736084, "token_acc": 0.8842233999184672, "grad_norm": 0.9116824269294739, "learning_rate": 7.619613236681843e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207241, "epoch": 1.7929292929292928, "step": 355}, {"loss": 0.2200457811355591, "token_acc": 0.9080932784636488, "grad_norm": 0.32813596725463867, "learning_rate": 7.548083212959588e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207214, "epoch": 1.8181818181818183, "step": 360}, {"eval_loss": 0.35305824875831604, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.8366, "eval_samples_per_second": 2.178, "eval_steps_per_second": 2.178, "epoch": 1.8181818181818183, "step": 360}, {"loss": 0.28913445472717286, "token_acc": 0.870794734275963, "grad_norm": 0.9034874439239502, "learning_rate": 7.475841668524268e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207182, "epoch": 1.8434343434343434, "step": 365}, {"loss": 0.29883947372436526, "token_acc": 0.8907584448693435, "grad_norm": 0.37799835205078125, "learning_rate": 7.402908775933419e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207123, "epoch": 1.8686868686868687, "step": 370}, {"loss": 0.34903314113616946, "token_acc": 0.8823183635081119, "grad_norm": 0.5515828132629395, "learning_rate": 7.329304900794991e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206817, "epoch": 1.893939393939394, "step": 375}, {"loss": 0.3167442321777344, "token_acc": 0.8950437317784257, "grad_norm": 0.7102660536766052, "learning_rate": 7.255050596080509e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206935, "epoch": 1.9191919191919191, "step": 380}, {"eval_loss": 0.31274980306625366, "eval_token_acc": 0.7632367632367633, "eval_runtime": 1.8255, "eval_samples_per_second": 2.191, "eval_steps_per_second": 2.191, "epoch": 1.9191919191919191, 
"step": 380}, {"loss": 0.33083691596984866, "token_acc": 0.8845164609053497, "grad_norm": 0.24322476983070374, "learning_rate": 7.180166596385914e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206393, "epoch": 1.9444444444444444, "step": 385}, {"loss": 0.23392996788024903, "token_acc": 0.9032924310533349, "grad_norm": 0.3991744816303253, "learning_rate": 7.104673812141675e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206252, "epoch": 1.9696969696969697, "step": 390}, {"loss": 0.19444363117218016, "token_acc": 0.9170926872638364, "grad_norm": 0.38407906889915466, "learning_rate": 7.02859332377382e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206907, "epoch": 1.9949494949494948, "step": 395}, {"loss": 0.17200204133987426, "token_acc": 0.9564939219449776, "grad_norm": 0.5186311602592468, "learning_rate": 6.951946375817474e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207883, "epoch": 2.0202020202020203, "step": 400}, {"eval_loss": 0.3031849265098572, "eval_token_acc": 0.7542457542457542, "eval_runtime": 1.7906, "eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 2.0202020202020203, "step": 400}, {"loss": 0.12331185340881348, "token_acc": 0.9200656994251301, "grad_norm": 0.6981443166732788, "learning_rate": 6.874754370984606e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207599, "epoch": 2.0454545454545454, "step": 405}, {"loss": 0.12755507230758667, "token_acc": 0.9337340775726349, "grad_norm": 0.9956308603286743, "learning_rate": 6.797038864187564e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207712, "epoch": 2.0707070707070705, "step": 410}, {"loss": 0.12629324197769165, "token_acc": 0.9433671220802116, "grad_norm": 0.5786072611808777, "learning_rate": 6.718821556520151e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208254, "epoch": 2.095959595959596, "step": 415}, {"loss": 0.08971643447875977, "token_acc": 0.9842690534309737, "grad_norm": 0.40014225244522095, "learning_rate": 
6.640124289197845e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208984, "epoch": 2.121212121212121, "step": 420}, {"eval_loss": 0.2971835434436798, "eval_token_acc": 0.7622377622377622, "eval_runtime": 1.8393, "eval_samples_per_second": 2.175, "eval_steps_per_second": 2.175, "epoch": 2.121212121212121, "step": 420}, {"loss": 0.1467829942703247, "token_acc": 0.9118694362017804, "grad_norm": 0.645484209060669, "learning_rate": 6.560969037458933e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208889, "epoch": 2.1464646464646466, "step": 425}, {"loss": 0.08001596331596375, "token_acc": 0.9746255724718406, "grad_norm": 0.3015040159225464, "learning_rate": 6.481377904428171e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208643, "epoch": 2.1717171717171717, "step": 430}, {"loss": 0.12085707187652588, "token_acc": 0.9670248240088922, "grad_norm": 0.3580942451953888, "learning_rate": 6.401373114944781e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.20794, "epoch": 2.196969696969697, "step": 435}, {"loss": 0.12868813276290894, "token_acc": 0.9537296229211263, "grad_norm": 0.8356326222419739, "learning_rate": 6.320977009356431e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208265, "epoch": 2.2222222222222223, "step": 440}, {"eval_loss": 0.3201369047164917, "eval_token_acc": 0.7542457542457542, "eval_runtime": 1.8482, "eval_samples_per_second": 2.164, "eval_steps_per_second": 2.164, "epoch": 2.2222222222222223, "step": 440}, {"loss": 0.059886491298675536, "token_acc": 0.9480653040236534, "grad_norm": 0.3740726113319397, "learning_rate": 6.240212037280966e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207886, "epoch": 2.2474747474747474, "step": 445}, {"loss": 0.19066305160522462, "token_acc": 0.9332333083270817, "grad_norm": 0.8616650104522705, "learning_rate": 6.159100751337642e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208264, "epoch": 2.2727272727272725, "step": 450}, {"loss": 0.09232727885246277, "token_acc": 0.9671393509680938, 
"grad_norm": 0.34099313616752625, "learning_rate": 6.077665800849568e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208355, "epoch": 2.297979797979798, "step": 455}, {"loss": 0.11596425771713256, "token_acc": 0.9609312709296763, "grad_norm": 0.35433682799339294, "learning_rate": 5.99592992551918e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208578, "epoch": 2.323232323232323, "step": 460}, {"eval_loss": 0.3214629888534546, "eval_token_acc": 0.7582417582417582, "eval_runtime": 1.8025, "eval_samples_per_second": 2.219, "eval_steps_per_second": 2.219, "epoch": 2.323232323232323, "step": 460}, {"loss": 0.11863926649093628, "token_acc": 0.9122145401215168, "grad_norm": 0.5267307758331299, "learning_rate": 5.913915949078452e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208792, "epoch": 2.3484848484848486, "step": 465}, {"loss": 0.07829801440238952, "token_acc": 0.9680500284252416, "grad_norm": 0.7042592167854309, "learning_rate": 5.831646772915651e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208441, "epoch": 2.3737373737373737, "step": 470}, {"loss": 0.0904280424118042, "token_acc": 0.9607805987116332, "grad_norm": 0.8526184558868408, "learning_rate": 5.749145369680407e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208835, "epoch": 2.398989898989899, "step": 475}, {"loss": 0.17373031377792358, "token_acc": 0.942286629033617, "grad_norm": 0.3466002345085144, "learning_rate": 5.666434776868895e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207896, "epoch": 2.4242424242424243, "step": 480}, {"eval_loss": 0.3141739070415497, "eval_token_acc": 0.7582417582417582, "eval_runtime": 1.7942, "eval_samples_per_second": 2.229, "eval_steps_per_second": 2.229, "epoch": 2.4242424242424243, "step": 480}, {"loss": 0.13746129274368285, "token_acc": 0.9038551951104843, "grad_norm": 0.9034782648086548, "learning_rate": 5.583538090390882e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208273, "epoch": 2.4494949494949494, "step": 485}, {"loss": 
0.1857938289642334, "token_acc": 0.9280701754385965, "grad_norm": 0.5720848441123962, "learning_rate": 5.5004784581204927e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208224, "epoch": 2.474747474747475, "step": 490}, {"loss": 0.11083965301513672, "token_acc": 0.9560761346998536, "grad_norm": 0.35935673117637634, "learning_rate": 5.41727907343245e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208702, "epoch": 2.5, "step": 495}, {"loss": 0.1741043210029602, "token_acc": 0.9156242021955578, "grad_norm": 0.547448992729187, "learning_rate": 5.3339631687256084e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208566, "epoch": 2.525252525252525, "step": 500}, {"eval_loss": 0.3121967911720276, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.8195, "eval_samples_per_second": 2.198, "eval_steps_per_second": 2.198, "epoch": 2.525252525252525, "step": 500}, {"loss": 0.09475700855255127, "token_acc": 0.9292557111274871, "grad_norm": 0.5953697562217712, "learning_rate": 5.250554008935596e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208436, "epoch": 2.5505050505050506, "step": 505}, {"loss": 0.10801750421524048, "token_acc": 0.9673539518900344, "grad_norm": 0.6027724146842957, "learning_rate": 5.167074885038373e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208985, "epoch": 2.5757575757575757, "step": 510}, {"loss": 0.1285596013069153, "token_acc": 0.9517408906882591, "grad_norm": 0.3738398849964142, "learning_rate": 5.0835491075465045e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.209255, "epoch": 2.601010101010101, "step": 515}, {"loss": 0.13147177696228027, "token_acc": 0.9425985953538628, "grad_norm": 0.43995335698127747, "learning_rate": 5e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.209221, "epoch": 2.6262626262626263, "step": 520}, {"eval_loss": 0.3112495243549347, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.8299, "eval_samples_per_second": 2.186, "eval_steps_per_second": 2.186, "epoch": 2.6262626262626263, 
"step": 520}, {"loss": 0.08995423913002014, "token_acc": 0.9309090909090909, "grad_norm": 0.45797106623649597, "learning_rate": 4.916450892453495e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.209276, "epoch": 2.6515151515151514, "step": 525}, {"loss": 0.15920201539993287, "token_acc": 0.9442068067695741, "grad_norm": 0.3859306871891022, "learning_rate": 4.832925114961629e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208661, "epoch": 2.676767676767677, "step": 530}, {"loss": 0.10588231086730956, "token_acc": 0.965555432495293, "grad_norm": 0.9083346724510193, "learning_rate": 4.749445991064404e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.208205, "epoch": 2.702020202020202, "step": 535}, {"loss": 0.21194179058074952, "token_acc": 0.9197012138188608, "grad_norm": 0.9049814343452454, "learning_rate": 4.666036831274392e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207772, "epoch": 2.7272727272727275, "step": 540}, {"eval_loss": 0.3101465106010437, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.805, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 2.7272727272727275, "step": 540}, {"loss": 0.15268012285232543, "token_acc": 0.9091122592766557, "grad_norm": 0.5595670342445374, "learning_rate": 4.582720926567552e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207444, "epoch": 2.7525252525252526, "step": 545}, {"loss": 0.10676953792572022, "token_acc": 0.9608655616942909, "grad_norm": 0.2707825005054474, "learning_rate": 4.4995215418795085e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207501, "epoch": 2.7777777777777777, "step": 550}, {"loss": 0.0965248167514801, "token_acc": 0.9671724992257665, "grad_norm": 0.5363113880157471, "learning_rate": 4.416461909609119e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207753, "epoch": 2.8030303030303028, "step": 555}, {"loss": 0.1195767879486084, "token_acc": 0.9444240869671431, "grad_norm": 0.32662245631217957, "learning_rate": 4.333565223131107e-05, 
"memory(GiB)": 194.68, "train_speed(iter/s)": 0.20762, "epoch": 2.8282828282828283, "step": 560}, {"eval_loss": 0.30127066373825073, "eval_token_acc": 0.7572427572427572, "eval_runtime": 1.8237, "eval_samples_per_second": 2.193, "eval_steps_per_second": 2.193, "epoch": 2.8282828282828283, "step": 560}, {"loss": 0.14267081022262573, "token_acc": 0.9234961000106849, "grad_norm": 0.6958869695663452, "learning_rate": 4.250854630319593e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.207194, "epoch": 2.8535353535353534, "step": 565}, {"loss": 0.19810810089111328, "token_acc": 0.9246951219512195, "grad_norm": 0.6756062507629395, "learning_rate": 4.1683532270843504e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206849, "epoch": 2.878787878787879, "step": 570}, {"loss": 0.1045069694519043, "token_acc": 0.9572841133816744, "grad_norm": 0.8443397283554077, "learning_rate": 4.0860840509215496e-05, "memory(GiB)": 194.68, "train_speed(iter/s)": 0.206796, "epoch": 2.904040404040404, "step": 575}, {"loss": 0.15350983142852784, "token_acc": 0.9409324377716317, "grad_norm": 3.983555793762207, "learning_rate": 4.0040700744808204e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207187, "epoch": 2.929292929292929, "step": 580}, {"eval_loss": 0.3041973114013672, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.8171, "eval_samples_per_second": 2.201, "eval_steps_per_second": 2.201, "epoch": 2.929292929292929, "step": 580}, {"loss": 0.14068719148635864, "token_acc": 0.9072411729503291, "grad_norm": 0.7293962240219116, "learning_rate": 3.922334199150432e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207381, "epoch": 2.9545454545454546, "step": 585}, {"loss": 0.1358010172843933, "token_acc": 0.9516318887105404, "grad_norm": 0.6762995719909668, "learning_rate": 3.840899248662358e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206957, "epoch": 2.9797979797979797, "step": 590}, {"loss": 0.12651437520980835, "token_acc": 0.9631662269129287, "grad_norm": 
0.22191222012043, "learning_rate": 3.7597879627190334e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206604, "epoch": 3.005050505050505, "step": 595}, {"loss": 0.05826301574707031, "token_acc": 0.9831535392345204, "grad_norm": 0.5518550276756287, "learning_rate": 3.6790229906435705e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206751, "epoch": 3.0303030303030303, "step": 600}, {"eval_loss": 0.3032275140285492, "eval_token_acc": 0.7522477522477522, "eval_runtime": 1.788, "eval_samples_per_second": 2.237, "eval_steps_per_second": 2.237, "epoch": 3.0303030303030303, "step": 600}, {"loss": 0.0395317792892456, "token_acc": 0.9590757783434501, "grad_norm": 0.6190295219421387, "learning_rate": 3.598626885055219e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206057, "epoch": 3.0555555555555554, "step": 605}, {"loss": 0.021629127860069274, "token_acc": 0.9906427990235964, "grad_norm": 0.25448670983314514, "learning_rate": 3.5186220955718306e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206434, "epoch": 3.080808080808081, "step": 610}, {"loss": 0.012175245583057404, "token_acc": 0.9950428120775124, "grad_norm": 0.06814919412136078, "learning_rate": 3.4390309625410686e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206853, "epoch": 3.106060606060606, "step": 615}, {"loss": 0.03373536169528961, "token_acc": 0.9886826618379357, "grad_norm": 0.41807010769844055, "learning_rate": 3.3598757108021546e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206723, "epoch": 3.1313131313131315, "step": 620}, {"eval_loss": 0.31177499890327454, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.8259, "eval_samples_per_second": 2.191, "eval_steps_per_second": 2.191, "epoch": 3.1313131313131315, "step": 620}, {"loss": 0.03276803493499756, "token_acc": 0.9653735489631908, "grad_norm": 0.4315682351589203, "learning_rate": 3.281178443479852e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206122, "epoch": 3.1565656565656566, "step": 625}, {"loss": 
0.03847094178199768, "token_acc": 0.9868200836820084, "grad_norm": 0.3409155309200287, "learning_rate": 3.202961135812437e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206497, "epoch": 3.1818181818181817, "step": 630}, {"loss": 0.07642163634300232, "token_acc": 0.9701730418943534, "grad_norm": 0.822550356388092, "learning_rate": 3.1252456290153954e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206299, "epoch": 3.207070707070707, "step": 635}, {"loss": 0.06662976741790771, "token_acc": 0.9696140693698094, "grad_norm": 0.39569079875946045, "learning_rate": 3.0480536241825263e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205891, "epoch": 3.2323232323232323, "step": 640}, {"eval_loss": 0.3040866553783417, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.796, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 3.2323232323232323, "step": 640}, {"loss": 0.055225080251693724, "token_acc": 0.9549541529422239, "grad_norm": 0.5110739469528198, "learning_rate": 2.9714066762261823e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20523, "epoch": 3.257575757575758, "step": 645}, {"loss": 0.05822249054908753, "token_acc": 0.9828406388039416, "grad_norm": 0.30346569418907166, "learning_rate": 2.895326187858326e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205468, "epoch": 3.282828282828283, "step": 650}, {"loss": 0.026796561479568482, "token_acc": 0.992970946579194, "grad_norm": 0.39017254114151, "learning_rate": 2.8198334036140874e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205582, "epoch": 3.308080808080808, "step": 655}, {"loss": 0.045568430423736574, "token_acc": 0.9833729216152018, "grad_norm": 0.2839377522468567, "learning_rate": 2.74494940391949e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.2055, "epoch": 3.3333333333333335, "step": 660}, {"eval_loss": 0.3058268129825592, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.7988, "eval_samples_per_second": 2.224, "eval_steps_per_second": 2.224, 
"epoch": 3.3333333333333335, "step": 660}, {"loss": 0.04427232146263123, "token_acc": 0.9405374499714122, "grad_norm": 0.7259745597839355, "learning_rate": 2.6706950992050094e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205656, "epoch": 3.3585858585858586, "step": 665}, {"loss": 0.044158649444580075, "token_acc": 0.9826169405815424, "grad_norm": 0.6911581754684448, "learning_rate": 2.5970912240665813e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205791, "epoch": 3.3838383838383836, "step": 670}, {"loss": 0.038330867886543274, "token_acc": 0.9843804843804844, "grad_norm": 0.36632707715034485, "learning_rate": 2.5241583314757327e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205991, "epoch": 3.409090909090909, "step": 675}, {"loss": 0.05873996019363403, "token_acc": 0.974401170232218, "grad_norm": 0.4337018132209778, "learning_rate": 2.4519167870404125e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206296, "epoch": 3.4343434343434343, "step": 680}, {"eval_loss": 0.3051184415817261, "eval_token_acc": 0.7512487512487512, "eval_runtime": 1.8159, "eval_samples_per_second": 2.203, "eval_steps_per_second": 2.203, "epoch": 3.4343434343434343, "step": 680}, {"loss": 0.07444831728935242, "token_acc": 0.9341205032001766, "grad_norm": 0.38646677136421204, "learning_rate": 2.3803867633181574e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205939, "epoch": 3.45959595959596, "step": 685}, {"loss": 0.05726301074028015, "token_acc": 0.9722510074841682, "grad_norm": 0.934688925743103, "learning_rate": 2.3095882341831372e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205757, "epoch": 3.484848484848485, "step": 690}, {"loss": 0.059635859727859494, "token_acc": 0.9783371472158657, "grad_norm": 0.5144165754318237, "learning_rate": 2.2395409692487175e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205924, "epoch": 3.51010101010101, "step": 695}, {"loss": 0.009928755462169647, "token_acc": 0.9956945388624519, "grad_norm": 0.23240762948989868, 
"learning_rate": 2.1702645283470236e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206296, "epoch": 3.5353535353535355, "step": 700}, {"eval_loss": 0.30824291706085205, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8346, "eval_samples_per_second": 2.18, "eval_steps_per_second": 2.18, "epoch": 3.5353535353535355, "step": 700}, {"loss": 0.036235207319259645, "token_acc": 0.9407323518308796, "grad_norm": 0.35060033202171326, "learning_rate": 2.1017782560671123e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206441, "epoch": 3.5606060606060606, "step": 705}, {"loss": 0.031028282642364503, "token_acc": 0.9893070552468812, "grad_norm": 0.4275978207588196, "learning_rate": 2.0341012763532243e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206201, "epoch": 3.5858585858585856, "step": 710}, {"loss": 0.002437155693769455, "token_acc": 0.9986950848194868, "grad_norm": 0.2053557187318802, "learning_rate": 1.967252487164663e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206895, "epoch": 3.611111111111111, "step": 715}, {"loss": 0.01395910233259201, "token_acc": 0.9963244012331041, "grad_norm": 0.7973630428314209, "learning_rate": 1.9012505551987765e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206763, "epoch": 3.6363636363636362, "step": 720}, {"eval_loss": 0.31287190318107605, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.8131, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "epoch": 3.6363636363636362, "step": 720}, {"loss": 0.03765462040901184, "token_acc": 0.9560504300695181, "grad_norm": 0.45141226053237915, "learning_rate": 1.836113910678507e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206518, "epoch": 3.6616161616161618, "step": 725}, {"loss": 0.05053359270095825, "token_acc": 0.9809334657398212, "grad_norm": 0.2863880693912506, "learning_rate": 1.771860742205988e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206217, "epoch": 3.686868686868687, "step": 730}, {"loss": 0.030423933267593385, 
"token_acc": 0.9856907686463214, "grad_norm": 0.37708166241645813, "learning_rate": 1.7085089916835923e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206105, "epoch": 3.712121212121212, "step": 735}, {"loss": 0.04238354861736297, "token_acc": 0.9868162140889414, "grad_norm": 0.5678088068962097, "learning_rate": 1.646076349303884e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206402, "epoch": 3.7373737373737375, "step": 740}, {"eval_loss": 0.3119312822818756, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.8083, "eval_samples_per_second": 2.212, "eval_steps_per_second": 2.212, "epoch": 3.7373737373737375, "step": 740}, {"loss": 0.034655985236167905, "token_acc": 0.9208163265306123, "grad_norm": 0.3441520035266876, "learning_rate": 1.584580248609846e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206715, "epoch": 3.7626262626262625, "step": 745}, {"loss": 0.03131322860717774, "token_acc": 0.9910037878787878, "grad_norm": 0.7808462381362915, "learning_rate": 1.5240378616267886e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207112, "epoch": 3.787878787878788, "step": 750}, {"loss": 0.10859463214874268, "token_acc": 0.9532800955295171, "grad_norm": 0.4915235638618469, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206346, "epoch": 3.813131313131313, "step": 755}, {"loss": 0.01086568683385849, "token_acc": 0.997327632282202, "grad_norm": 0.36231735348701477, "learning_rate": 1.4058815806103542e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206831, "epoch": 3.8383838383838382, "step": 760}, {"eval_loss": 0.30831077694892883, "eval_token_acc": 0.7542457542457542, "eval_runtime": 1.7998, "eval_samples_per_second": 2.222, "eval_steps_per_second": 2.222, "epoch": 3.8383838383838382, "step": 760}, {"loss": 0.055282962322235105, "token_acc": 0.9400766283524904, "grad_norm": 0.4630831182003021, "learning_rate": 1.3483006802566544e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206812, "epoch": 
3.8636363636363638, "step": 765}, {"loss": 0.028864413499832153, "token_acc": 0.989685597116938, "grad_norm": 0.10027548670768738, "learning_rate": 1.2917394717602121e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206816, "epoch": 3.888888888888889, "step": 770}, {"loss": 0.03930847942829132, "token_acc": 0.9889024019458802, "grad_norm": 0.4784111976623535, "learning_rate": 1.2362137491387432e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206798, "epoch": 3.9141414141414144, "step": 775}, {"loss": 0.04449716806411743, "token_acc": 0.9804500703234881, "grad_norm": 0.4522968530654907, "learning_rate": 1.1817390172633403e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206775, "epoch": 3.9393939393939394, "step": 780}, {"eval_loss": 0.3084229528903961, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.8372, "eval_samples_per_second": 2.177, "eval_steps_per_second": 2.177, "epoch": 3.9393939393939394, "step": 780}, {"loss": 0.03509677648544311, "token_acc": 0.9363057324840764, "grad_norm": 1.1634060144424438, "learning_rate": 1.1283304875289336e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206843, "epoch": 3.9646464646464645, "step": 785}, {"loss": 0.04887769818305969, "token_acc": 0.9847414070028911, "grad_norm": 0.5749446749687195, "learning_rate": 1.0760030736066951e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207005, "epoch": 3.98989898989899, "step": 790}, {"loss": 0.02856588065624237, "token_acc": 0.9900847593155285, "grad_norm": 0.2582509219646454, "learning_rate": 1.024771387279585e-05, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20717, "epoch": 4.015151515151516, "step": 795}, {"loss": 0.00884537547826767, "token_acc": 0.9983554885569412, "grad_norm": 0.3403402268886566, "learning_rate": 9.746497343621857e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207207, "epoch": 4.040404040404041, "step": 800}, {"eval_loss": 0.30910566449165344, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.7909, 
"eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 4.040404040404041, "step": 800}, {"loss": 0.01580266058444977, "token_acc": 0.9634023220595659, "grad_norm": 0.3678455650806427, "learning_rate": 9.256521107059834e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206982, "epoch": 4.065656565656566, "step": 805}, {"loss": 0.002520921640098095, "token_acc": 0.9998035749361619, "grad_norm": 0.02392689324915409, "learning_rate": 8.777921982911996e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207283, "epoch": 4.090909090909091, "step": 810}, {"loss": 0.01393090933561325, "token_acc": 0.9945280437756497, "grad_norm": 0.29595932364463806, "learning_rate": 8.310833614062651e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207408, "epoch": 4.116161616161616, "step": 815}, {"loss": 0.007520098239183426, "token_acc": 0.9984389634717452, "grad_norm": 0.033545322716236115, "learning_rate": 7.85538642916015e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207412, "epoch": 4.141414141414141, "step": 820}, {"eval_loss": 0.312429279088974, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.8138, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "epoch": 4.141414141414141, "step": 820}, {"loss": 0.020593562722206117, "token_acc": 0.9678671554884624, "grad_norm": 0.2401837557554245, "learning_rate": 7.4117076061961885e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207046, "epoch": 4.166666666666667, "step": 825}, {"loss": 0.051178747415542604, "token_acc": 0.9609134826526131, "grad_norm": 0.3309480845928192, "learning_rate": 6.979921036993042e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207063, "epoch": 4.191919191919192, "step": 830}, {"loss": 0.0256537526845932, "token_acc": 0.9917600102999872, "grad_norm": 0.37649092078208923, "learning_rate": 6.5601472926081766e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207066, "epoch": 4.217171717171717, "step": 835}, {"loss": 0.02737319767475128, 
"token_acc": 0.9857651245551602, "grad_norm": 0.34800857305526733, "learning_rate": 6.152503589666425e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206927, "epoch": 4.242424242424242, "step": 840}, {"eval_loss": 0.3184429109096527, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.7906, "eval_samples_per_second": 2.234, "eval_steps_per_second": 2.234, "epoch": 4.242424242424242, "step": 840}, {"loss": 0.06719566583633423, "token_acc": 0.9502145922746781, "grad_norm": 0.7228756546974182, "learning_rate": 5.757103757628573e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206416, "epoch": 4.267676767676767, "step": 845}, {"loss": 0.04835307598114014, "token_acc": 0.981879262954291, "grad_norm": 0.38411420583724976, "learning_rate": 5.374058207005944e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206169, "epoch": 4.292929292929293, "step": 850}, {"loss": 0.012648317217826843, "token_acc": 0.9956322341122515, "grad_norm": 0.1029912531375885, "learning_rate": 5.0034738985296095e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20603, "epoch": 4.318181818181818, "step": 855}, {"loss": 0.012134979665279388, "token_acc": 0.9967785234899329, "grad_norm": 0.24367812275886536, "learning_rate": 4.645454313282965e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.205974, "epoch": 4.343434343434343, "step": 860}, {"eval_loss": 0.32311469316482544, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.7858, "eval_samples_per_second": 2.24, "eval_steps_per_second": 2.24, "epoch": 4.343434343434343, "step": 860}, {"loss": 0.010147520154714585, "token_acc": 0.9645875251509054, "grad_norm": 0.18834778666496277, "learning_rate": 4.3000994238058644e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.20586, "epoch": 4.3686868686868685, "step": 865}, {"loss": 0.008408330380916595, "token_acc": 0.9981830194912454, "grad_norm": 0.1570287048816681, "learning_rate": 3.967505666178556e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206013, "epoch": 
4.393939393939394, "step": 870}, {"loss": 0.006087615713477134, "token_acc": 0.996905820924386, "grad_norm": 0.15527097880840302, "learning_rate": 3.647765913093132e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206279, "epoch": 4.41919191919192, "step": 875}, {"loss": 0.01747373789548874, "token_acc": 0.9946977730646872, "grad_norm": 0.49585437774658203, "learning_rate": 3.340969447919873e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206368, "epoch": 4.444444444444445, "step": 880}, {"eval_loss": 0.32524874806404114, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8805, "eval_samples_per_second": 2.127, "eval_steps_per_second": 2.127, "epoch": 4.444444444444445, "step": 880}, {"loss": 0.00792773962020874, "token_acc": 0.9659295448701466, "grad_norm": 0.2330419272184372, "learning_rate": 3.0472019397761064e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206247, "epoch": 4.46969696969697, "step": 885}, {"loss": 0.011314756423234939, "token_acc": 0.9967257844474762, "grad_norm": 0.13684527575969696, "learning_rate": 2.7665454196040664e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206195, "epoch": 4.494949494949495, "step": 890}, {"loss": 0.021187099814414977, "token_acc": 0.9942469693856585, "grad_norm": 0.3359576463699341, "learning_rate": 2.4990782572647975e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206448, "epoch": 4.52020202020202, "step": 895}, {"loss": 0.03609364330768585, "token_acc": 0.9888475836431226, "grad_norm": 0.28014275431632996, "learning_rate": 2.2448751396543787e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206265, "epoch": 4.545454545454545, "step": 900}, {"eval_loss": 0.32550811767578125, "eval_token_acc": 0.7472527472527473, "eval_runtime": 1.8451, "eval_samples_per_second": 2.168, "eval_steps_per_second": 2.168, "epoch": 4.545454545454545, "step": 900}, {"loss": 0.011403033882379532, "token_acc": 0.9590459752079084, "grad_norm": 1.3766915798187256, "learning_rate": 2.004007049848461e-06, 
"memory(GiB)": 194.69, "train_speed(iter/s)": 0.206299, "epoch": 4.570707070707071, "step": 905}, {"loss": 0.006288837641477585, "token_acc": 0.9984427718660783, "grad_norm": 0.06532011181116104, "learning_rate": 1.7765412472811771e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206688, "epoch": 4.595959595959596, "step": 910}, {"loss": 0.017497456073760985, "token_acc": 0.9928386408654579, "grad_norm": 0.021015260368585587, "learning_rate": 1.5625412489637337e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206791, "epoch": 4.621212121212121, "step": 915}, {"loss": 0.004374519735574722, "token_acc": 0.9980553154710458, "grad_norm": 0.11464820802211761, "learning_rate": 1.3620668117481472e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207068, "epoch": 4.646464646464646, "step": 920}, {"eval_loss": 0.32665225863456726, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.808, "eval_samples_per_second": 2.212, "eval_steps_per_second": 2.212, "epoch": 4.646464646464646, "step": 920}, {"loss": 0.003269971534609795, "token_acc": 0.9575136386179534, "grad_norm": 0.42058226466178894, "learning_rate": 1.1751739156407649e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207081, "epoch": 4.671717171717171, "step": 925}, {"loss": 0.015348337590694427, "token_acc": 0.9930389817024662, "grad_norm": 0.008930183947086334, "learning_rate": 1.0019147481706625e-06, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207309, "epoch": 4.696969696969697, "step": 930}, {"loss": 0.021287491917610167, "token_acc": 0.9904045899693342, "grad_norm": 0.42432549595832825, "learning_rate": 8.423376898168245e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207003, "epoch": 4.722222222222222, "step": 935}, {"loss": 0.015300212800502777, "token_acc": 0.9938466655579578, "grad_norm": 0.09640122205018997, "learning_rate": 6.964873004985717e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207138, "epoch": 4.747474747474747, "step": 940}, {"eval_loss": 0.3267664313316345, 
"eval_token_acc": 0.7502497502497503, "eval_runtime": 1.8049, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 4.747474747474747, "step": 940}, {"loss": 0.012129424512386322, "token_acc": 0.9729514717581543, "grad_norm": 0.09156661480665207, "learning_rate": 5.644043071326932e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206551, "epoch": 4.7727272727272725, "step": 945}, {"loss": 0.009703928232192993, "token_acc": 0.9948709729123257, "grad_norm": 0.014584671705961227, "learning_rate": 4.461255922609986e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206615, "epoch": 4.797979797979798, "step": 950}, {"loss": 0.0038586195558309557, "token_acc": 0.9988066825775657, "grad_norm": 0.09790431708097458, "learning_rate": 3.416841837512952e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206618, "epoch": 4.8232323232323235, "step": 955}, {"loss": 0.023634913563728332, "token_acc": 0.9918020343100046, "grad_norm": 0.36749953031539917, "learning_rate": 2.511092455747932e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206703, "epoch": 4.848484848484849, "step": 960}, {"eval_loss": 0.327303946018219, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8393, "eval_samples_per_second": 2.175, "eval_steps_per_second": 2.175, "epoch": 4.848484848484849, "step": 960}, {"loss": 0.014398331940174102, "token_acc": 0.9552456623966272, "grad_norm": 0.21629047393798828, "learning_rate": 1.7442606966242004e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206714, "epoch": 4.873737373737374, "step": 965}, {"loss": 0.00892629474401474, "token_acc": 0.9974954082484555, "grad_norm": 0.2632206678390503, "learning_rate": 1.1165606884234181e-07, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206903, "epoch": 4.898989898989899, "step": 970}, {"loss": 0.006720826029777527, "token_acc": 0.9985783915515841, "grad_norm": 0.14011166989803314, "learning_rate": 6.281677086071303e-08, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207185, "epoch": 
4.924242424242424, "step": 975}, {"loss": 0.03538868129253388, "token_acc": 0.9834126862233143, "grad_norm": 0.07906040549278259, "learning_rate": 2.792181348726941e-08, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207226, "epoch": 4.94949494949495, "step": 980}, {"eval_loss": 0.3269588053226471, "eval_token_acc": 0.7472527472527473, "eval_runtime": 1.7898, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 4.94949494949495, "step": 980}, {"loss": 0.06200398206710815, "token_acc": 0.9508980763099161, "grad_norm": 0.30053889751434326, "learning_rate": 6.980940707146389e-09, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.206925, "epoch": 4.974747474747475, "step": 985}, {"loss": 0.004158956184983254, "token_acc": 0.999026921829387, "grad_norm": 0.19153615832328796, "learning_rate": 0.0, "memory(GiB)": 194.69, "train_speed(iter/s)": 0.207036, "epoch": 5.0, "step": 990}, {"eval_loss": 0.32688042521476746, "eval_token_acc": 0.7472527472527473, "eval_runtime": 1.8656, "eval_samples_per_second": 2.144, "eval_steps_per_second": 2.144, "epoch": 5.0, "step": 990}, {"train_runtime": 4786.0578, "train_samples_per_second": 0.414, "train_steps_per_second": 0.207, "total_flos": 6.384415932845261e+17, "train_loss": 0.19521468806202816, "epoch": 5.0, "step": 990}], "memory": 194.69140625} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs/events.out.tfevents.1737743586.kml-task-547024-record-9965643-prod-worker-0.81411.0 b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs/events.out.tfevents.1737743586.kml-task-547024-record-9965643-prod-worker-0.81411.0 new file mode 100644 index 0000000000000000000000000000000000000000..31232225289240ca20564b3e63252321ede79000 --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_random20/v0-20250124-183105/runs/events.out.tfevents.1737743586.kml-task-547024-record-9965643-prod-worker-0.81411.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dbf32ecf9c985ef17363725572b82442fe92e94d0f7d724df0039eb749eb8e6 +size 99071 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4d974a37c655ea9a43cc737104f9f8b6bd1bdd8d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + 
"bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + 
"restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs', logging_strategy=, 
logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, 
include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/README.md b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- 
**Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4678d8c7e244c928931ecfbe2de2ca148cb81822 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "k_proj", + "v_proj", + "up_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fe87ad02da091444f4008f1ee0c50c94cb90ef7 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d2d3ff3c72700008691431d27ae02e9434ee7c7ee0b6ed49f822c1383352dab +size 828526568 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/additional_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/args.json new file mode 100644 index 
0000000000000000000000000000000000000000..4d974a37c655ea9a43cc737104f9f8b6bd1bdd8d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + 
"use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + 
"predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + 
"model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, 
hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, 
eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, 
use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/optimizer.pt b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..62eb4be2ab18587f45ad98cc9418d5ba5324b961 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20154d3d047e069f97ad8acd27d0c8f3cf539c49f6fc51707c7145c3317ef5d +size 1657698290 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/rng_state.pth b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c790bba2817e4b9e355863149b620eca3999422 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f23818629bbee20a2adf6cb5c27b27c121d83c90c47c043eb47588f4abd4339f +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/scheduler.pt 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9d46ff6665a5c0006e41ea0f4c239e674e5a56a --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8338b77329852142ffeced3aceb82c58a3841625a48484d0d601e883285586b +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/trainer_state.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7138b0dde815035f778bb64a9ce5713a0e99ceca --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/trainer_state.json @@ -0,0 +1,1415 @@ +{ + "best_metric": 0.28748947, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560", + "epoch": 2.8282828282828283, + "eval_steps": 20, + "global_step": 560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.5212649703025818, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5845019221305847, + "memory(GiB)": 143.92, + "step": 1, + "token_acc": 0.8486238532110092, + "train_speed(iter/s)": 0.152386 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.8231858015060425, + "learning_rate": 1e-05, + "loss": 0.7612156867980957, + "memory(GiB)": 153.24, + "step": 5, + "token_acc": 0.8290306867998052, + "train_speed(iter/s)": 0.221696 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 
0.613670825958252, + "learning_rate": 2e-05, + "loss": 0.8103227615356445, + "memory(GiB)": 160.15, + "step": 10, + "token_acc": 0.7863496684457383, + "train_speed(iter/s)": 0.228033 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.5065694451332092, + "learning_rate": 3e-05, + "loss": 0.7224256038665772, + "memory(GiB)": 169.88, + "step": 15, + "token_acc": 0.8002382843526609, + "train_speed(iter/s)": 0.222881 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 5.616745471954346, + "learning_rate": 4e-05, + "loss": 0.783719539642334, + "memory(GiB)": 169.88, + "step": 20, + "token_acc": 0.865086333040679, + "train_speed(iter/s)": 0.240866 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.1500141620635986, + "eval_runtime": 1.8151, + "eval_samples_per_second": 2.204, + "eval_steps_per_second": 2.204, + "eval_token_acc": 0.7232767232767233, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.4805212914943695, + "learning_rate": 5e-05, + "loss": 0.5110222339630127, + "memory(GiB)": 178.91, + "step": 25, + "token_acc": 0.8279441117764471, + "train_speed(iter/s)": 0.217821 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.0994250774383545, + "learning_rate": 6e-05, + "loss": 0.5762582778930664, + "memory(GiB)": 178.91, + "step": 30, + "token_acc": 0.8128789462680326, + "train_speed(iter/s)": 0.225877 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.30022498965263367, + "learning_rate": 7e-05, + "loss": 0.3708608150482178, + "memory(GiB)": 178.91, + "step": 35, + "token_acc": 0.8526220614828209, + "train_speed(iter/s)": 0.229602 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.31309282779693604, + "learning_rate": 8e-05, + "loss": 0.44199090003967284, + "memory(GiB)": 178.91, + "step": 40, + "token_acc": 0.8467268299670534, + "train_speed(iter/s)": 0.229371 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5262672305107117, + "eval_runtime": 1.7976, + "eval_samples_per_second": 2.225, + 
"eval_steps_per_second": 2.225, + "eval_token_acc": 0.7352647352647352, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.4576653838157654, + "learning_rate": 9e-05, + "loss": 0.4439223289489746, + "memory(GiB)": 178.91, + "step": 45, + "token_acc": 0.837237851662404, + "train_speed(iter/s)": 0.220221 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.3684069514274597, + "learning_rate": 0.0001, + "loss": 0.5241156101226807, + "memory(GiB)": 178.91, + "step": 50, + "token_acc": 0.8669076569175156, + "train_speed(iter/s)": 0.215242 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4167088270187378, + "learning_rate": 9.999301905929286e-05, + "loss": 0.4691337585449219, + "memory(GiB)": 178.91, + "step": 55, + "token_acc": 0.8475441501103753, + "train_speed(iter/s)": 0.215104 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.4622468948364258, + "learning_rate": 9.997207818651274e-05, + "loss": 0.36278769969940183, + "memory(GiB)": 178.92, + "step": 60, + "token_acc": 0.8557241379310345, + "train_speed(iter/s)": 0.220608 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.4990268349647522, + "eval_runtime": 1.7923, + "eval_samples_per_second": 2.232, + "eval_steps_per_second": 2.232, + "eval_token_acc": 0.7402597402597403, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.6556711792945862, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5440140247344971, + "memory(GiB)": 178.92, + "step": 65, + "token_acc": 0.8285714285714286, + "train_speed(iter/s)": 0.215587 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.27956634759902954, + "learning_rate": 9.988834393115767e-05, + "loss": 0.40032358169555665, + "memory(GiB)": 178.92, + "step": 70, + "token_acc": 0.8736349453978159, + "train_speed(iter/s)": 0.213465 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.4587284028530121, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5715262413024902, + "memory(GiB)": 178.92, + "step": 75, 
+ "token_acc": 0.833641404805915, + "train_speed(iter/s)": 0.217719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.3916527330875397, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6002557277679443, + "memory(GiB)": 178.95, + "step": 80, + "token_acc": 0.8368476147749364, + "train_speed(iter/s)": 0.218182 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5162495970726013, + "eval_runtime": 1.8197, + "eval_samples_per_second": 2.198, + "eval_steps_per_second": 2.198, + "eval_token_acc": 0.7392607392607392, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.27539631724357605, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5449264526367188, + "memory(GiB)": 178.95, + "step": 85, + "token_acc": 0.8142857142857143, + "train_speed(iter/s)": 0.214816 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.6312692761421204, + "learning_rate": 9.9553874407739e-05, + "loss": 0.43953518867492675, + "memory(GiB)": 178.95, + "step": 90, + "token_acc": 0.8481365377917102, + "train_speed(iter/s)": 0.216199 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.33478057384490967, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3530363798141479, + "memory(GiB)": 178.95, + "step": 95, + "token_acc": 0.8667912439935932, + "train_speed(iter/s)": 0.215501 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.2933506965637207, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4162618637084961, + "memory(GiB)": 194.37, + "step": 100, + "token_acc": 0.8655149051490515, + "train_speed(iter/s)": 0.212789 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5181649327278137, + "eval_runtime": 1.7982, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 2.225, + "eval_token_acc": 0.7402597402597403, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.48549583554267883, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5441759586334228, + "memory(GiB)": 194.37, + "step": 105, + "token_acc": 
0.8028541226215645, + "train_speed(iter/s)": 0.212013 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.273707628250122, + "learning_rate": 9.899808525182935e-05, + "loss": 0.5125463962554931, + "memory(GiB)": 194.37, + "step": 110, + "token_acc": 0.8068669527896996, + "train_speed(iter/s)": 0.216603 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.24062682688236237, + "learning_rate": 9.882482608435923e-05, + "loss": 0.45648856163024903, + "memory(GiB)": 194.37, + "step": 115, + "token_acc": 0.8614628614628614, + "train_speed(iter/s)": 0.213323 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7656003832817078, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5108787059783936, + "memory(GiB)": 194.37, + "step": 120, + "token_acc": 0.8145441030723488, + "train_speed(iter/s)": 0.212345 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.49168646335601807, + "eval_runtime": 1.7863, + "eval_samples_per_second": 2.239, + "eval_steps_per_second": 2.239, + "eval_token_acc": 0.7402597402597403, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.3627430200576782, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4693441867828369, + "memory(GiB)": 194.37, + "step": 125, + "token_acc": 0.8352281825460368, + "train_speed(iter/s)": 0.212255 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.39431232213974, + "learning_rate": 9.822345875271883e-05, + "loss": 0.47632036209106443, + "memory(GiB)": 194.37, + "step": 130, + "token_acc": 0.8450905624404195, + "train_speed(iter/s)": 0.211377 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.22374582290649414, + "learning_rate": 9.799599295015154e-05, + "loss": 0.3673699378967285, + "memory(GiB)": 194.37, + "step": 135, + "token_acc": 0.8731134712129681, + "train_speed(iter/s)": 0.211464 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.3889102041721344, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5292303085327148, + "memory(GiB)": 194.37, + "step": 
140, + "token_acc": 0.8273034877667881, + "train_speed(iter/s)": 0.211083 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.43900543451309204, + "eval_runtime": 1.8334, + "eval_samples_per_second": 2.182, + "eval_steps_per_second": 2.182, + "eval_token_acc": 0.7422577422577422, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.5705273151397705, + "learning_rate": 9.750092174273521e-05, + "loss": 0.34286372661590575, + "memory(GiB)": 194.37, + "step": 145, + "token_acc": 0.8596546310832025, + "train_speed(iter/s)": 0.208774 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.9250723123550415, + "learning_rate": 9.723345458039594e-05, + "loss": 0.4045823097229004, + "memory(GiB)": 194.37, + "step": 150, + "token_acc": 0.8724293596388564, + "train_speed(iter/s)": 0.20966 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.4707590639591217, + "learning_rate": 9.69527980602239e-05, + "loss": 0.4202705383300781, + "memory(GiB)": 194.42, + "step": 155, + "token_acc": 0.8564288391853055, + "train_speed(iter/s)": 0.209281 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.4233284890651703, + "learning_rate": 9.665903055208014e-05, + "loss": 0.34715895652770995, + "memory(GiB)": 194.42, + "step": 160, + "token_acc": 0.8820569271898098, + "train_speed(iter/s)": 0.210051 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.4276379346847534, + "eval_runtime": 1.8048, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 2.216, + "eval_token_acc": 0.7472527472527473, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.2773014008998871, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4147395133972168, + "memory(GiB)": 194.42, + "step": 165, + "token_acc": 0.8452348628835189, + "train_speed(iter/s)": 0.209147 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.4066998064517975, + "learning_rate": 9.603249433382144e-05, + "loss": 0.4583017349243164, + "memory(GiB)": 194.42, + "step": 170, + 
"token_acc": 0.853536021150033, + "train_speed(iter/s)": 0.208696 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.49174752831459045, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4163652896881104, + "memory(GiB)": 194.42, + "step": 175, + "token_acc": 0.8529512111907199, + "train_speed(iter/s)": 0.209604 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3412103056907654, + "learning_rate": 9.535454568671704e-05, + "loss": 0.422959041595459, + "memory(GiB)": 194.42, + "step": 180, + "token_acc": 0.8615751789976134, + "train_speed(iter/s)": 0.21007 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.4359378516674042, + "eval_runtime": 1.7765, + "eval_samples_per_second": 2.252, + "eval_steps_per_second": 2.252, + "eval_token_acc": 0.7552447552447552, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.481300413608551, + "learning_rate": 9.49965261014704e-05, + "loss": 0.4941267967224121, + "memory(GiB)": 194.42, + "step": 185, + "token_acc": 0.8200705112062453, + "train_speed(iter/s)": 0.20925 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 2.57511305809021, + "learning_rate": 9.462594179299406e-05, + "loss": 0.7393960952758789, + "memory(GiB)": 194.42, + "step": 190, + "token_acc": 0.799098337559876, + "train_speed(iter/s)": 0.210972 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.27773517370224, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5489778041839599, + "memory(GiB)": 194.42, + "step": 195, + "token_acc": 0.8190687361419069, + "train_speed(iter/s)": 0.210161 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.2995460033416748, + "learning_rate": 9.384749641033359e-05, + "loss": 0.4746543407440186, + "memory(GiB)": 194.42, + "step": 200, + "token_acc": 0.8563213924935893, + "train_speed(iter/s)": 0.207474 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4713101387023926, + "eval_runtime": 1.7784, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 2.249, + 
"eval_token_acc": 0.7502497502497503, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.2918975055217743, + "learning_rate": 9.343985270739182e-05, + "loss": 0.4258098602294922, + "memory(GiB)": 194.42, + "step": 205, + "token_acc": 0.8379081675480567, + "train_speed(iter/s)": 0.204959 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.4730541408061981, + "learning_rate": 9.302007896300698e-05, + "loss": 0.3576375722885132, + "memory(GiB)": 194.42, + "step": 210, + "token_acc": 0.8768303186907838, + "train_speed(iter/s)": 0.204647 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.3962819576263428, + "learning_rate": 9.25882923938038e-05, + "loss": 0.330736780166626, + "memory(GiB)": 194.42, + "step": 215, + "token_acc": 0.8893819007326386, + "train_speed(iter/s)": 0.203729 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.551486074924469, + "learning_rate": 9.214461357083985e-05, + "loss": 0.29806084632873536, + "memory(GiB)": 194.42, + "step": 220, + "token_acc": 0.9017960602549246, + "train_speed(iter/s)": 0.204164 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.4675810933113098, + "eval_runtime": 1.7981, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 2.225, + "eval_token_acc": 0.7492507492507493, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.5311359167098999, + "learning_rate": 9.168916638593736e-05, + "loss": 0.42431864738464353, + "memory(GiB)": 194.42, + "step": 225, + "token_acc": 0.8460293607675413, + "train_speed(iter/s)": 0.203358 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.39372366666793823, + "learning_rate": 9.122207801708802e-05, + "loss": 0.36201488971710205, + "memory(GiB)": 194.42, + "step": 230, + "token_acc": 0.8752454042477245, + "train_speed(iter/s)": 0.202013 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.48343992233276367, + "learning_rate": 9.074347889294016e-05, + "loss": 0.1575523853302002, + "memory(GiB)": 194.42, + "step": 
235, + "token_acc": 0.9267042542286007, + "train_speed(iter/s)": 0.203514 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.7413302659988403, + "learning_rate": 9.025350265637815e-05, + "loss": 0.38658602237701417, + "memory(GiB)": 194.42, + "step": 240, + "token_acc": 0.8714543367765207, + "train_speed(iter/s)": 0.204106 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5141359567642212, + "eval_runtime": 1.7909, + "eval_samples_per_second": 2.233, + "eval_steps_per_second": 2.233, + "eval_token_acc": 0.7432567432567433, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.4979130029678345, + "learning_rate": 8.975228612720416e-05, + "loss": 0.24457972049713134, + "memory(GiB)": 194.42, + "step": 245, + "token_acc": 0.8632831873036866, + "train_speed(iter/s)": 0.204329 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.7785468101501465, + "learning_rate": 8.923996926393305e-05, + "loss": 0.38393685817718504, + "memory(GiB)": 194.42, + "step": 250, + "token_acc": 0.8645948945615982, + "train_speed(iter/s)": 0.205736 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.429641991853714, + "learning_rate": 8.871669512471068e-05, + "loss": 0.34473409652709963, + "memory(GiB)": 194.42, + "step": 255, + "token_acc": 0.8690122539918307, + "train_speed(iter/s)": 0.204547 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.6987989544868469, + "learning_rate": 8.818260982736661e-05, + "loss": 0.32251389026641847, + "memory(GiB)": 194.42, + "step": 260, + "token_acc": 0.8750648901193978, + "train_speed(iter/s)": 0.2052 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4578605592250824, + "eval_runtime": 1.7868, + "eval_samples_per_second": 2.239, + "eval_steps_per_second": 2.239, + "eval_token_acc": 0.7602397602397603, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.22191783785820007, + "learning_rate": 8.763786250861256e-05, + "loss": 0.2529636859893799, + "memory(GiB)": 194.42, + "step": 265, + 
"token_acc": 0.899982859101817, + "train_speed(iter/s)": 0.203055 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.4377196431159973, + "learning_rate": 8.708260528239788e-05, + "loss": 0.23594443798065184, + "memory(GiB)": 194.42, + "step": 270, + "token_acc": 0.9083276216586703, + "train_speed(iter/s)": 0.20368 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.3827890455722809, + "learning_rate": 8.651699319743347e-05, + "loss": 0.25529866218566893, + "memory(GiB)": 194.42, + "step": 275, + "token_acc": 0.9052517596101787, + "train_speed(iter/s)": 0.203283 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.8312171697616577, + "learning_rate": 8.594118419389647e-05, + "loss": 0.3728013038635254, + "memory(GiB)": 194.42, + "step": 280, + "token_acc": 0.8851649320867878, + "train_speed(iter/s)": 0.203978 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.4208544194698334, + "eval_runtime": 1.7793, + "eval_samples_per_second": 2.248, + "eval_steps_per_second": 2.248, + "eval_token_acc": 0.7592407592407593, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.7446438074111938, + "learning_rate": 8.535533905932738e-05, + "loss": 0.17479790449142457, + "memory(GiB)": 194.42, + "step": 285, + "token_acc": 0.9063637940003468, + "train_speed(iter/s)": 0.20415 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.7986549139022827, + "learning_rate": 8.475962138373213e-05, + "loss": 0.33746435642242434, + "memory(GiB)": 194.42, + "step": 290, + "token_acc": 0.8648478488982162, + "train_speed(iter/s)": 0.205059 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.4530141353607178, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3507548332214355, + "memory(GiB)": 194.42, + "step": 295, + "token_acc": 0.8484919335983165, + "train_speed(iter/s)": 0.206092 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.5765312910079956, + "learning_rate": 8.353923650696118e-05, + "loss": 0.2767606496810913, + "memory(GiB)": 
194.42, + "step": 300, + "token_acc": 0.8876570583887657, + "train_speed(iter/s)": 0.206251 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.4114963114261627, + "eval_runtime": 1.7835, + "eval_samples_per_second": 2.243, + "eval_steps_per_second": 2.243, + "eval_token_acc": 0.7542457542457542, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.5467161536216736, + "learning_rate": 8.291491008316409e-05, + "loss": 0.33141241073608396, + "memory(GiB)": 194.42, + "step": 305, + "token_acc": 0.8746642793196061, + "train_speed(iter/s)": 0.205011 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.5182997584342957, + "learning_rate": 8.228139257794012e-05, + "loss": 0.2601468086242676, + "memory(GiB)": 194.42, + "step": 310, + "token_acc": 0.9101903695408735, + "train_speed(iter/s)": 0.205931 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 1.4313561916351318, + "learning_rate": 8.163886089321493e-05, + "loss": 0.22466778755187988, + "memory(GiB)": 194.42, + "step": 315, + "token_acc": 0.9071316614420063, + "train_speed(iter/s)": 0.206665 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.4848739504814148, + "learning_rate": 8.098749444801224e-05, + "loss": 0.2971429586410522, + "memory(GiB)": 194.42, + "step": 320, + "token_acc": 0.9094011790257525, + "train_speed(iter/s)": 0.206913 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.34123557806015015, + "eval_runtime": 1.7732, + "eval_samples_per_second": 2.256, + "eval_steps_per_second": 2.256, + "eval_token_acc": 0.7562437562437563, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.5274702906608582, + "learning_rate": 8.032747512835337e-05, + "loss": 0.30359759330749514, + "memory(GiB)": 194.42, + "step": 325, + "token_acc": 0.8647662793839502, + "train_speed(iter/s)": 0.206592 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7116047739982605, + "learning_rate": 7.965898723646776e-05, + "loss": 0.369882869720459, + "memory(GiB)": 194.42, + 
"step": 330, + "token_acc": 0.8955607977696761, + "train_speed(iter/s)": 0.207439 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.46889209747314453, + "learning_rate": 7.898221743932888e-05, + "loss": 0.2918565034866333, + "memory(GiB)": 194.42, + "step": 335, + "token_acc": 0.89257481648786, + "train_speed(iter/s)": 0.207421 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.976447582244873, + "learning_rate": 7.829735471652978e-05, + "loss": 0.20344338417053223, + "memory(GiB)": 194.42, + "step": 340, + "token_acc": 0.9203519855595668, + "train_speed(iter/s)": 0.208201 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.34089022874832153, + "eval_runtime": 1.8007, + "eval_samples_per_second": 2.221, + "eval_steps_per_second": 2.221, + "eval_token_acc": 0.7562437562437563, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.5278966426849365, + "learning_rate": 7.760459030751284e-05, + "loss": 0.24397382736206055, + "memory(GiB)": 194.42, + "step": 345, + "token_acc": 0.9052574525745257, + "train_speed(iter/s)": 0.207523 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.4450560808181763, + "learning_rate": 7.690411765816864e-05, + "loss": 0.18279753923416137, + "memory(GiB)": 194.42, + "step": 350, + "token_acc": 0.9326676907322069, + "train_speed(iter/s)": 0.208339 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.9007227420806885, + "learning_rate": 7.619613236681843e-05, + "loss": 0.3635742664337158, + "memory(GiB)": 194.42, + "step": 355, + "token_acc": 0.8810979752683789, + "train_speed(iter/s)": 0.208407 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3327200412750244, + "learning_rate": 7.548083212959588e-05, + "loss": 0.2239600419998169, + "memory(GiB)": 194.42, + "step": 360, + "token_acc": 0.908592093777279, + "train_speed(iter/s)": 0.208397 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.3721417784690857, + "eval_runtime": 1.8042, + "eval_samples_per_second": 2.217, + 
"eval_steps_per_second": 2.217, + "eval_token_acc": 0.7482517482517482, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.898142397403717, + "learning_rate": 7.475841668524268e-05, + "loss": 0.2930032253265381, + "memory(GiB)": 194.42, + "step": 365, + "token_acc": 0.8698196001950268, + "train_speed(iter/s)": 0.208386 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.37038329243659973, + "learning_rate": 7.402908775933419e-05, + "loss": 0.30235629081726073, + "memory(GiB)": 194.42, + "step": 370, + "token_acc": 0.8878266411727215, + "train_speed(iter/s)": 0.208339 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.5051683187484741, + "learning_rate": 7.329304900794991e-05, + "loss": 0.3541992664337158, + "memory(GiB)": 194.42, + "step": 375, + "token_acc": 0.8819656712908536, + "train_speed(iter/s)": 0.208034 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.716480016708374, + "learning_rate": 7.255050596080509e-05, + "loss": 0.31901164054870607, + "memory(GiB)": 194.42, + "step": 380, + "token_acc": 0.8962932111620159, + "train_speed(iter/s)": 0.208147 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.31777071952819824, + "eval_runtime": 1.8015, + "eval_samples_per_second": 2.22, + "eval_steps_per_second": 2.22, + "eval_token_acc": 0.7632367632367633, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.24409626424312592, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3302799701690674, + "memory(GiB)": 194.42, + "step": 385, + "token_acc": 0.8846450617283951, + "train_speed(iter/s)": 0.207591 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.41487976908683777, + "learning_rate": 7.104673812141675e-05, + "loss": 0.23348629474639893, + "memory(GiB)": 194.42, + "step": 390, + "token_acc": 0.9038998906572713, + "train_speed(iter/s)": 0.207441 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3993523418903351, + "learning_rate": 7.02859332377382e-05, + "loss": 0.19255179166793823, + 
"memory(GiB)": 194.42, + "step": 395, + "token_acc": 0.9186485885752389, + "train_speed(iter/s)": 0.208111 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.3870049715042114, + "learning_rate": 6.951946375817474e-05, + "loss": 0.17468175888061524, + "memory(GiB)": 194.42, + "step": 400, + "token_acc": 0.9561740243122201, + "train_speed(iter/s)": 0.209087 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.28842809796333313, + "eval_runtime": 1.7921, + "eval_samples_per_second": 2.232, + "eval_steps_per_second": 2.232, + "eval_token_acc": 0.7612387612387612, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.8590208292007446, + "learning_rate": 6.874754370984606e-05, + "loss": 0.1243563175201416, + "memory(GiB)": 194.42, + "step": 405, + "token_acc": 0.9203394470298385, + "train_speed(iter/s)": 0.208801 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 1.1880351305007935, + "learning_rate": 6.797038864187564e-05, + "loss": 0.12814103364944457, + "memory(GiB)": 194.42, + "step": 410, + "token_acc": 0.9328753399169887, + "train_speed(iter/s)": 0.208916 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.5621036887168884, + "learning_rate": 6.718821556520151e-05, + "loss": 0.1351101279258728, + "memory(GiB)": 194.42, + "step": 415, + "token_acc": 0.9418245923314236, + "train_speed(iter/s)": 0.20947 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.3390972912311554, + "learning_rate": 6.640124289197845e-05, + "loss": 0.08711874485015869, + "memory(GiB)": 194.42, + "step": 420, + "token_acc": 0.9826417141307295, + "train_speed(iter/s)": 0.21021 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.2877776026725769, + "eval_runtime": 1.7912, + "eval_samples_per_second": 2.233, + "eval_steps_per_second": 2.233, + "eval_token_acc": 0.7612387612387612, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.6231220960617065, + "learning_rate": 6.560969037458933e-05, + "loss": 0.14229173660278321, + 
"memory(GiB)": 194.42, + "step": 425, + "token_acc": 0.9111275964391692, + "train_speed(iter/s)": 0.210123 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.2991831600666046, + "learning_rate": 6.481377904428171e-05, + "loss": 0.09396474957466125, + "memory(GiB)": 194.42, + "step": 430, + "token_acc": 0.9717786854808763, + "train_speed(iter/s)": 0.209878 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.2828837037086487, + "learning_rate": 6.401373114944781e-05, + "loss": 0.1196476936340332, + "memory(GiB)": 194.42, + "step": 435, + "token_acc": 0.9668395702111894, + "train_speed(iter/s)": 0.209161 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8178226947784424, + "learning_rate": 6.320977009356431e-05, + "loss": 0.13254005908966066, + "memory(GiB)": 194.42, + "step": 440, + "token_acc": 0.9543882759756298, + "train_speed(iter/s)": 0.209483 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.3008834719657898, + "eval_runtime": 1.7791, + "eval_samples_per_second": 2.248, + "eval_steps_per_second": 2.248, + "eval_token_acc": 0.7562437562437563, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.4310154318809509, + "learning_rate": 6.240212037280966e-05, + "loss": 0.055006617307662965, + "memory(GiB)": 194.42, + "step": 445, + "token_acc": 0.9505077773492737, + "train_speed(iter/s)": 0.209114 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.8850460648536682, + "learning_rate": 6.159100751337642e-05, + "loss": 0.1947050929069519, + "memory(GiB)": 194.42, + "step": 450, + "token_acc": 0.932483120780195, + "train_speed(iter/s)": 0.209501 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.4071020185947418, + "learning_rate": 6.077665800849568e-05, + "loss": 0.0962955117225647, + "memory(GiB)": 194.42, + "step": 455, + "token_acc": 0.9671393509680938, + "train_speed(iter/s)": 0.209586 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.41429275274276733, + "learning_rate": 5.99592992551918e-05, + "loss": 
0.11653723716735839, + "memory(GiB)": 194.42, + "step": 460, + "token_acc": 0.9625259129325466, + "train_speed(iter/s)": 0.209806 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3004966974258423, + "eval_runtime": 1.7662, + "eval_samples_per_second": 2.265, + "eval_steps_per_second": 2.265, + "eval_token_acc": 0.7532467532467533, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.5336225628852844, + "learning_rate": 5.913915949078452e-05, + "loss": 0.12927284240722656, + "memory(GiB)": 194.42, + "step": 465, + "token_acc": 0.9099099099099099, + "train_speed(iter/s)": 0.210062 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.6900014281272888, + "learning_rate": 5.831646772915651e-05, + "loss": 0.08137755393981934, + "memory(GiB)": 194.42, + "step": 470, + "token_acc": 0.9656623081296191, + "train_speed(iter/s)": 0.209705 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 0.8796831369400024, + "learning_rate": 5.749145369680407e-05, + "loss": 0.10649137496948242, + "memory(GiB)": 194.42, + "step": 475, + "token_acc": 0.9598332701780977, + "train_speed(iter/s)": 0.210106 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.33510109782218933, + "learning_rate": 5.666434776868895e-05, + "loss": 0.17189998626708985, + "memory(GiB)": 194.42, + "step": 480, + "token_acc": 0.9404330609149886, + "train_speed(iter/s)": 0.209154 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.3011992573738098, + "eval_runtime": 1.7649, + "eval_samples_per_second": 2.266, + "eval_steps_per_second": 2.266, + "eval_token_acc": 0.7582417582417582, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.720079243183136, + "learning_rate": 5.583538090390882e-05, + "loss": 0.0982659637928009, + "memory(GiB)": 194.42, + "step": 485, + "token_acc": 0.9170192759755524, + "train_speed(iter/s)": 0.209559 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5884720087051392, + "learning_rate": 5.5004784581204927e-05, + "loss": 
0.19064462184906006, + "memory(GiB)": 194.42, + "step": 490, + "token_acc": 0.9257759784075573, + "train_speed(iter/s)": 0.209499 + }, + { + "epoch": 2.5, + "grad_norm": 0.33910003304481506, + "learning_rate": 5.41727907343245e-05, + "loss": 0.10975323915481568, + "memory(GiB)": 194.42, + "step": 495, + "token_acc": 0.958795231123196, + "train_speed(iter/s)": 0.209984 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.5363466143608093, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.17116068601608275, + "memory(GiB)": 194.42, + "step": 500, + "token_acc": 0.9163900944600459, + "train_speed(iter/s)": 0.209844 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.2906345725059509, + "eval_runtime": 1.8092, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7522477522477522, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.6032365560531616, + "learning_rate": 5.250554008935596e-05, + "loss": 0.09048279523849487, + "memory(GiB)": 194.42, + "step": 505, + "token_acc": 0.9289609432571849, + "train_speed(iter/s)": 0.209717 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.6278122067451477, + "learning_rate": 5.167074885038373e-05, + "loss": 0.09625995755195618, + "memory(GiB)": 194.42, + "step": 510, + "token_acc": 0.9664948453608248, + "train_speed(iter/s)": 0.210284 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.38628754019737244, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.13246564865112304, + "memory(GiB)": 194.42, + "step": 515, + "token_acc": 0.9501214574898785, + "train_speed(iter/s)": 0.210549 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.465506374835968, + "learning_rate": 5e-05, + "loss": 0.13336524963378907, + "memory(GiB)": 194.42, + "step": 520, + "token_acc": 0.942733657482442, + "train_speed(iter/s)": 0.210504 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.2984510660171509, + "eval_runtime": 1.8075, + "eval_samples_per_second": 2.213, 
+ "eval_steps_per_second": 2.213, + "eval_token_acc": 0.7502497502497503, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.4542198181152344, + "learning_rate": 4.916450892453495e-05, + "loss": 0.09229624271392822, + "memory(GiB)": 194.42, + "step": 525, + "token_acc": 0.931699604743083, + "train_speed(iter/s)": 0.210556 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3877967596054077, + "learning_rate": 4.832925114961629e-05, + "loss": 0.16157912015914916, + "memory(GiB)": 194.42, + "step": 530, + "token_acc": 0.9453226706341826, + "train_speed(iter/s)": 0.209926 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7550373077392578, + "learning_rate": 4.749445991064404e-05, + "loss": 0.09830494523048401, + "memory(GiB)": 194.42, + "step": 535, + "token_acc": 0.9667737290951379, + "train_speed(iter/s)": 0.209454 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.8958114981651306, + "learning_rate": 4.666036831274392e-05, + "loss": 0.21627907752990722, + "memory(GiB)": 194.42, + "step": 540, + "token_acc": 0.91897499740637, + "train_speed(iter/s)": 0.20902 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.29240885376930237, + "eval_runtime": 1.8087, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7582417582417582, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.6230780482292175, + "learning_rate": 4.582720926567552e-05, + "loss": 0.15035767555236818, + "memory(GiB)": 194.42, + "step": 545, + "token_acc": 0.9105213715359324, + "train_speed(iter/s)": 0.208676 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.2731610834598541, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.10666660070419312, + "memory(GiB)": 194.42, + "step": 550, + "token_acc": 0.9625537139349294, + "train_speed(iter/s)": 0.20872 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.5083901286125183, + "learning_rate": 4.416461909609119e-05, + "loss": 0.092490154504776, + 
"memory(GiB)": 194.42, + "step": 555, + "token_acc": 0.9688758129451843, + "train_speed(iter/s)": 0.208968 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.3543936014175415, + "learning_rate": 4.333565223131107e-05, + "loss": 0.12382739782333374, + "memory(GiB)": 194.42, + "step": 560, + "token_acc": 0.9427140588738243, + "train_speed(iter/s)": 0.208825 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.28748947381973267, + "eval_runtime": 1.8169, + "eval_samples_per_second": 2.202, + "eval_steps_per_second": 2.202, + "eval_token_acc": 0.7542457542457542, + "step": 560 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.5554716213692006e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/training_args.bin b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2ade5288a82aa26858fccdfbb4e928d25b606382 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea0b2f710093e3c5b35669dba40183b02dbc7c28db39c667ac4476d3bbfefe6 +size 5944 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/README.md b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..2661c8cb0636171d7cff720d2f18e9e0d82c323b --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-70b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_config.json 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4678d8c7e244c928931ecfbe2de2ca148cb81822 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "k_proj", + "v_proj", + "up_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_model.safetensors b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d3bebc2e2c64f853e284d7724b72fc6ba4cd1c7 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e20b5c3d0327f035cc8b317d1c839b035ad0650e4657509d1a93aadf04fe4c7 +size 828526568 diff --git 
a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/additional_config.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/args.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4d974a37c655ea9a43cc737104f9f8b6bd1bdd8d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/args.json @@ -0,0 +1,305 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_sft_0.5_what.json" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": 
"reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 2, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": -1, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": 
null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + 
"vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "rank": -1, + "global_world_size": 1, + "local_world_size": 1, + "model_suffix": "deepseek-r1-70b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config={'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-70b", + "hub": "", + "training_args": "Seq2SeqTrainingArguments(output_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, 
weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=2, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/optimizer.pt b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b03e19d37610f18d7c335152ed470ddd7de6642b --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8eccf6f05cb6b83780c5d89b36ad0d1764a79fd06b6070e4685683d2aa876ca +size 1657698290 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/rng_state.pth b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1d353417fbdb10a5af00c482de0925d280cdf10 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2324f2d3860be12513aaa6551fdbada599d74005d8f080e5be7460b8baaf9c8 +size 14244 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/scheduler.pt b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f8882c16fb0abc091aaea5286781182c084d87d --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b3a16451354ac84ec594942621c3011b01d575ac8a6b2fa4481b0291c904a7 +size 1064 diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/trainer_state.json b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7a05c11f7cf78718d0bb0b6b5e723f22912fc9 --- /dev/null +++ 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/trainer_state.json @@ -0,0 +1,2473 @@ +{ + "best_metric": 0.28748947, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 990, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 0.5212649703025818, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5845019221305847, + "memory(GiB)": 143.92, + "step": 1, + "token_acc": 0.8486238532110092, + "train_speed(iter/s)": 0.152386 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 0.8231858015060425, + "learning_rate": 1e-05, + "loss": 0.7612156867980957, + "memory(GiB)": 153.24, + "step": 5, + "token_acc": 0.8290306867998052, + "train_speed(iter/s)": 0.221696 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 0.613670825958252, + "learning_rate": 2e-05, + "loss": 0.8103227615356445, + "memory(GiB)": 160.15, + "step": 10, + "token_acc": 0.7863496684457383, + "train_speed(iter/s)": 0.228033 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.5065694451332092, + "learning_rate": 3e-05, + "loss": 0.7224256038665772, + "memory(GiB)": 169.88, + "step": 15, + "token_acc": 0.8002382843526609, + "train_speed(iter/s)": 0.222881 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 5.616745471954346, + "learning_rate": 4e-05, + "loss": 0.783719539642334, + "memory(GiB)": 169.88, + "step": 20, + "token_acc": 0.865086333040679, + "train_speed(iter/s)": 0.240866 + }, + { + "epoch": 0.10101010101010101, + "eval_loss": 1.1500141620635986, + "eval_runtime": 1.8151, + "eval_samples_per_second": 2.204, + "eval_steps_per_second": 2.204, + "eval_token_acc": 0.7232767232767233, + "step": 20 + }, + { + "epoch": 0.12626262626262627, + 
"grad_norm": 0.4805212914943695, + "learning_rate": 5e-05, + "loss": 0.5110222339630127, + "memory(GiB)": 178.91, + "step": 25, + "token_acc": 0.8279441117764471, + "train_speed(iter/s)": 0.217821 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.0994250774383545, + "learning_rate": 6e-05, + "loss": 0.5762582778930664, + "memory(GiB)": 178.91, + "step": 30, + "token_acc": 0.8128789462680326, + "train_speed(iter/s)": 0.225877 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.30022498965263367, + "learning_rate": 7e-05, + "loss": 0.3708608150482178, + "memory(GiB)": 178.91, + "step": 35, + "token_acc": 0.8526220614828209, + "train_speed(iter/s)": 0.229602 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.31309282779693604, + "learning_rate": 8e-05, + "loss": 0.44199090003967284, + "memory(GiB)": 178.91, + "step": 40, + "token_acc": 0.8467268299670534, + "train_speed(iter/s)": 0.229371 + }, + { + "epoch": 0.20202020202020202, + "eval_loss": 0.5262672305107117, + "eval_runtime": 1.7976, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 2.225, + "eval_token_acc": 0.7352647352647352, + "step": 40 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.4576653838157654, + "learning_rate": 9e-05, + "loss": 0.4439223289489746, + "memory(GiB)": 178.91, + "step": 45, + "token_acc": 0.837237851662404, + "train_speed(iter/s)": 0.220221 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.3684069514274597, + "learning_rate": 0.0001, + "loss": 0.5241156101226807, + "memory(GiB)": 178.91, + "step": 50, + "token_acc": 0.8669076569175156, + "train_speed(iter/s)": 0.215242 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4167088270187378, + "learning_rate": 9.999301905929286e-05, + "loss": 0.4691337585449219, + "memory(GiB)": 178.91, + "step": 55, + "token_acc": 0.8475441501103753, + "train_speed(iter/s)": 0.215104 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.4622468948364258, + "learning_rate": 9.997207818651274e-05, + 
"loss": 0.36278769969940183, + "memory(GiB)": 178.92, + "step": 60, + "token_acc": 0.8557241379310345, + "train_speed(iter/s)": 0.220608 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 0.4990268349647522, + "eval_runtime": 1.7923, + "eval_samples_per_second": 2.232, + "eval_steps_per_second": 2.232, + "eval_token_acc": 0.7402597402597403, + "step": 60 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.6556711792945862, + "learning_rate": 9.99371832291393e-05, + "loss": 0.5440140247344971, + "memory(GiB)": 178.92, + "step": 65, + "token_acc": 0.8285714285714286, + "train_speed(iter/s)": 0.215587 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.27956634759902954, + "learning_rate": 9.988834393115767e-05, + "loss": 0.40032358169555665, + "memory(GiB)": 178.92, + "step": 70, + "token_acc": 0.8736349453978159, + "train_speed(iter/s)": 0.213465 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.4587284028530121, + "learning_rate": 9.982557393033758e-05, + "loss": 0.5715262413024902, + "memory(GiB)": 178.92, + "step": 75, + "token_acc": 0.833641404805915, + "train_speed(iter/s)": 0.217719 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.3916527330875397, + "learning_rate": 9.974889075442521e-05, + "loss": 0.6002557277679443, + "memory(GiB)": 178.95, + "step": 80, + "token_acc": 0.8368476147749364, + "train_speed(iter/s)": 0.218182 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.5162495970726013, + "eval_runtime": 1.8197, + "eval_samples_per_second": 2.198, + "eval_steps_per_second": 2.198, + "eval_token_acc": 0.7392607392607392, + "step": 80 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.27539631724357605, + "learning_rate": 9.965831581624871e-05, + "loss": 0.5449264526367188, + "memory(GiB)": 178.95, + "step": 85, + "token_acc": 0.8142857142857143, + "train_speed(iter/s)": 0.214816 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.6312692761421204, + "learning_rate": 9.9553874407739e-05, + "loss": 
0.43953518867492675, + "memory(GiB)": 178.95, + "step": 90, + "token_acc": 0.8481365377917102, + "train_speed(iter/s)": 0.216199 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.33478057384490967, + "learning_rate": 9.94355956928673e-05, + "loss": 0.3530363798141479, + "memory(GiB)": 178.95, + "step": 95, + "token_acc": 0.8667912439935932, + "train_speed(iter/s)": 0.215501 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.2933506965637207, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4162618637084961, + "memory(GiB)": 194.37, + "step": 100, + "token_acc": 0.8655149051490515, + "train_speed(iter/s)": 0.212789 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 0.5181649327278137, + "eval_runtime": 1.7982, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 2.225, + "eval_token_acc": 0.7402597402597403, + "step": 100 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.48549583554267883, + "learning_rate": 9.915766231018318e-05, + "loss": 0.5441759586334228, + "memory(GiB)": 194.37, + "step": 105, + "token_acc": 0.8028541226215645, + "train_speed(iter/s)": 0.212013 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.273707628250122, + "learning_rate": 9.899808525182935e-05, + "loss": 0.5125463962554931, + "memory(GiB)": 194.37, + "step": 110, + "token_acc": 0.8068669527896996, + "train_speed(iter/s)": 0.216603 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.24062682688236237, + "learning_rate": 9.882482608435923e-05, + "loss": 0.45648856163024903, + "memory(GiB)": 194.37, + "step": 115, + "token_acc": 0.8614628614628614, + "train_speed(iter/s)": 0.213323 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7656003832817078, + "learning_rate": 9.863793318825186e-05, + "loss": 0.5108787059783936, + "memory(GiB)": 194.37, + "step": 120, + "token_acc": 0.8145441030723488, + "train_speed(iter/s)": 0.212345 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 0.49168646335601807, + "eval_runtime": 1.7863, + 
"eval_samples_per_second": 2.239, + "eval_steps_per_second": 2.239, + "eval_token_acc": 0.7402597402597403, + "step": 120 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.3627430200576782, + "learning_rate": 9.843745875103627e-05, + "loss": 0.4693441867828369, + "memory(GiB)": 194.37, + "step": 125, + "token_acc": 0.8352281825460368, + "train_speed(iter/s)": 0.212255 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.39431232213974, + "learning_rate": 9.822345875271883e-05, + "loss": 0.47632036209106443, + "memory(GiB)": 194.37, + "step": 130, + "token_acc": 0.8450905624404195, + "train_speed(iter/s)": 0.211377 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.22374582290649414, + "learning_rate": 9.799599295015154e-05, + "loss": 0.3673699378967285, + "memory(GiB)": 194.37, + "step": 135, + "token_acc": 0.8731134712129681, + "train_speed(iter/s)": 0.211464 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.3889102041721344, + "learning_rate": 9.775512486034563e-05, + "loss": 0.5292303085327148, + "memory(GiB)": 194.37, + "step": 140, + "token_acc": 0.8273034877667881, + "train_speed(iter/s)": 0.211083 + }, + { + "epoch": 0.7070707070707071, + "eval_loss": 0.43900543451309204, + "eval_runtime": 1.8334, + "eval_samples_per_second": 2.182, + "eval_steps_per_second": 2.182, + "eval_token_acc": 0.7422577422577422, + "step": 140 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.5705273151397705, + "learning_rate": 9.750092174273521e-05, + "loss": 0.34286372661590575, + "memory(GiB)": 194.37, + "step": 145, + "token_acc": 0.8596546310832025, + "train_speed(iter/s)": 0.208774 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.9250723123550415, + "learning_rate": 9.723345458039594e-05, + "loss": 0.4045823097229004, + "memory(GiB)": 194.37, + "step": 150, + "token_acc": 0.8724293596388564, + "train_speed(iter/s)": 0.20966 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.4707590639591217, + "learning_rate": 9.69527980602239e-05, 
+ "loss": 0.4202705383300781, + "memory(GiB)": 194.42, + "step": 155, + "token_acc": 0.8564288391853055, + "train_speed(iter/s)": 0.209281 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.4233284890651703, + "learning_rate": 9.665903055208014e-05, + "loss": 0.34715895652770995, + "memory(GiB)": 194.42, + "step": 160, + "token_acc": 0.8820569271898098, + "train_speed(iter/s)": 0.210051 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.4276379346847534, + "eval_runtime": 1.8048, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 2.216, + "eval_token_acc": 0.7472527472527473, + "step": 160 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.2773014008998871, + "learning_rate": 9.635223408690688e-05, + "loss": 0.4147395133972168, + "memory(GiB)": 194.42, + "step": 165, + "token_acc": 0.8452348628835189, + "train_speed(iter/s)": 0.209147 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.4066998064517975, + "learning_rate": 9.603249433382144e-05, + "loss": 0.4583017349243164, + "memory(GiB)": 194.42, + "step": 170, + "token_acc": 0.853536021150033, + "train_speed(iter/s)": 0.208696 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.49174752831459045, + "learning_rate": 9.569990057619414e-05, + "loss": 0.4163652896881104, + "memory(GiB)": 194.42, + "step": 175, + "token_acc": 0.8529512111907199, + "train_speed(iter/s)": 0.209604 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3412103056907654, + "learning_rate": 9.535454568671704e-05, + "loss": 0.422959041595459, + "memory(GiB)": 194.42, + "step": 180, + "token_acc": 0.8615751789976134, + "train_speed(iter/s)": 0.21007 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 0.4359378516674042, + "eval_runtime": 1.7765, + "eval_samples_per_second": 2.252, + "eval_steps_per_second": 2.252, + "eval_token_acc": 0.7552447552447552, + "step": 180 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.481300413608551, + "learning_rate": 9.49965261014704e-05, + "loss": 
0.4941267967224121, + "memory(GiB)": 194.42, + "step": 185, + "token_acc": 0.8200705112062453, + "train_speed(iter/s)": 0.20925 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 2.57511305809021, + "learning_rate": 9.462594179299406e-05, + "loss": 0.7393960952758789, + "memory(GiB)": 194.42, + "step": 190, + "token_acc": 0.799098337559876, + "train_speed(iter/s)": 0.210972 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.27773517370224, + "learning_rate": 9.424289624237144e-05, + "loss": 0.5489778041839599, + "memory(GiB)": 194.42, + "step": 195, + "token_acc": 0.8190687361419069, + "train_speed(iter/s)": 0.210161 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.2995460033416748, + "learning_rate": 9.384749641033359e-05, + "loss": 0.4746543407440186, + "memory(GiB)": 194.42, + "step": 200, + "token_acc": 0.8563213924935893, + "train_speed(iter/s)": 0.207474 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 0.4713101387023926, + "eval_runtime": 1.7784, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 2.249, + "eval_token_acc": 0.7502497502497503, + "step": 200 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.2918975055217743, + "learning_rate": 9.343985270739182e-05, + "loss": 0.4258098602294922, + "memory(GiB)": 194.42, + "step": 205, + "token_acc": 0.8379081675480567, + "train_speed(iter/s)": 0.204959 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.4730541408061981, + "learning_rate": 9.302007896300698e-05, + "loss": 0.3576375722885132, + "memory(GiB)": 194.42, + "step": 210, + "token_acc": 0.8768303186907838, + "train_speed(iter/s)": 0.204647 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.3962819576263428, + "learning_rate": 9.25882923938038e-05, + "loss": 0.330736780166626, + "memory(GiB)": 194.42, + "step": 215, + "token_acc": 0.8893819007326386, + "train_speed(iter/s)": 0.203729 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.551486074924469, + "learning_rate": 9.214461357083985e-05, + 
"loss": 0.29806084632873536, + "memory(GiB)": 194.42, + "step": 220, + "token_acc": 0.9017960602549246, + "train_speed(iter/s)": 0.204164 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.4675810933113098, + "eval_runtime": 1.7981, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 2.225, + "eval_token_acc": 0.7492507492507493, + "step": 220 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.5311359167098999, + "learning_rate": 9.168916638593736e-05, + "loss": 0.42431864738464353, + "memory(GiB)": 194.42, + "step": 225, + "token_acc": 0.8460293607675413, + "train_speed(iter/s)": 0.203358 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.39372366666793823, + "learning_rate": 9.122207801708802e-05, + "loss": 0.36201488971710205, + "memory(GiB)": 194.42, + "step": 230, + "token_acc": 0.8752454042477245, + "train_speed(iter/s)": 0.202013 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.48343992233276367, + "learning_rate": 9.074347889294016e-05, + "loss": 0.1575523853302002, + "memory(GiB)": 194.42, + "step": 235, + "token_acc": 0.9267042542286007, + "train_speed(iter/s)": 0.203514 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.7413302659988403, + "learning_rate": 9.025350265637815e-05, + "loss": 0.38658602237701417, + "memory(GiB)": 194.42, + "step": 240, + "token_acc": 0.8714543367765207, + "train_speed(iter/s)": 0.204106 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.5141359567642212, + "eval_runtime": 1.7909, + "eval_samples_per_second": 2.233, + "eval_steps_per_second": 2.233, + "eval_token_acc": 0.7432567432567433, + "step": 240 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.4979130029678345, + "learning_rate": 8.975228612720416e-05, + "loss": 0.24457972049713134, + "memory(GiB)": 194.42, + "step": 245, + "token_acc": 0.8632831873036866, + "train_speed(iter/s)": 0.204329 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.7785468101501465, + "learning_rate": 8.923996926393305e-05, + 
"loss": 0.38393685817718504, + "memory(GiB)": 194.42, + "step": 250, + "token_acc": 0.8645948945615982, + "train_speed(iter/s)": 0.205736 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.429641991853714, + "learning_rate": 8.871669512471068e-05, + "loss": 0.34473409652709963, + "memory(GiB)": 194.42, + "step": 255, + "token_acc": 0.8690122539918307, + "train_speed(iter/s)": 0.204547 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.6987989544868469, + "learning_rate": 8.818260982736661e-05, + "loss": 0.32251389026641847, + "memory(GiB)": 194.42, + "step": 260, + "token_acc": 0.8750648901193978, + "train_speed(iter/s)": 0.2052 + }, + { + "epoch": 1.3131313131313131, + "eval_loss": 0.4578605592250824, + "eval_runtime": 1.7868, + "eval_samples_per_second": 2.239, + "eval_steps_per_second": 2.239, + "eval_token_acc": 0.7602397602397603, + "step": 260 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.22191783785820007, + "learning_rate": 8.763786250861256e-05, + "loss": 0.2529636859893799, + "memory(GiB)": 194.42, + "step": 265, + "token_acc": 0.899982859101817, + "train_speed(iter/s)": 0.203055 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.4377196431159973, + "learning_rate": 8.708260528239788e-05, + "loss": 0.23594443798065184, + "memory(GiB)": 194.42, + "step": 270, + "token_acc": 0.9083276216586703, + "train_speed(iter/s)": 0.20368 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.3827890455722809, + "learning_rate": 8.651699319743347e-05, + "loss": 0.25529866218566893, + "memory(GiB)": 194.42, + "step": 275, + "token_acc": 0.9052517596101787, + "train_speed(iter/s)": 0.203283 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.8312171697616577, + "learning_rate": 8.594118419389647e-05, + "loss": 0.3728013038635254, + "memory(GiB)": 194.42, + "step": 280, + "token_acc": 0.8851649320867878, + "train_speed(iter/s)": 0.203978 + }, + { + "epoch": 1.4141414141414141, + "eval_loss": 0.4208544194698334, + "eval_runtime": 1.7793, 
+ "eval_samples_per_second": 2.248, + "eval_steps_per_second": 2.248, + "eval_token_acc": 0.7592407592407593, + "step": 280 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.7446438074111938, + "learning_rate": 8.535533905932738e-05, + "loss": 0.17479790449142457, + "memory(GiB)": 194.42, + "step": 285, + "token_acc": 0.9063637940003468, + "train_speed(iter/s)": 0.20415 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.7986549139022827, + "learning_rate": 8.475962138373213e-05, + "loss": 0.33746435642242434, + "memory(GiB)": 194.42, + "step": 290, + "token_acc": 0.8648478488982162, + "train_speed(iter/s)": 0.205059 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.4530141353607178, + "learning_rate": 8.415419751390155e-05, + "loss": 0.3507548332214355, + "memory(GiB)": 194.42, + "step": 295, + "token_acc": 0.8484919335983165, + "train_speed(iter/s)": 0.206092 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.5765312910079956, + "learning_rate": 8.353923650696118e-05, + "loss": 0.2767606496810913, + "memory(GiB)": 194.42, + "step": 300, + "token_acc": 0.8876570583887657, + "train_speed(iter/s)": 0.206251 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.4114963114261627, + "eval_runtime": 1.7835, + "eval_samples_per_second": 2.243, + "eval_steps_per_second": 2.243, + "eval_token_acc": 0.7542457542457542, + "step": 300 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.5467161536216736, + "learning_rate": 8.291491008316409e-05, + "loss": 0.33141241073608396, + "memory(GiB)": 194.42, + "step": 305, + "token_acc": 0.8746642793196061, + "train_speed(iter/s)": 0.205011 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.5182997584342957, + "learning_rate": 8.228139257794012e-05, + "loss": 0.2601468086242676, + "memory(GiB)": 194.42, + "step": 310, + "token_acc": 0.9101903695408735, + "train_speed(iter/s)": 0.205931 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 1.4313561916351318, + "learning_rate": 
8.163886089321493e-05, + "loss": 0.22466778755187988, + "memory(GiB)": 194.42, + "step": 315, + "token_acc": 0.9071316614420063, + "train_speed(iter/s)": 0.206665 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.4848739504814148, + "learning_rate": 8.098749444801224e-05, + "loss": 0.2971429586410522, + "memory(GiB)": 194.42, + "step": 320, + "token_acc": 0.9094011790257525, + "train_speed(iter/s)": 0.206913 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.34123557806015015, + "eval_runtime": 1.7732, + "eval_samples_per_second": 2.256, + "eval_steps_per_second": 2.256, + "eval_token_acc": 0.7562437562437563, + "step": 320 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.5274702906608582, + "learning_rate": 8.032747512835337e-05, + "loss": 0.30359759330749514, + "memory(GiB)": 194.42, + "step": 325, + "token_acc": 0.8647662793839502, + "train_speed(iter/s)": 0.206592 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7116047739982605, + "learning_rate": 7.965898723646776e-05, + "loss": 0.369882869720459, + "memory(GiB)": 194.42, + "step": 330, + "token_acc": 0.8955607977696761, + "train_speed(iter/s)": 0.207439 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.46889209747314453, + "learning_rate": 7.898221743932888e-05, + "loss": 0.2918565034866333, + "memory(GiB)": 194.42, + "step": 335, + "token_acc": 0.89257481648786, + "train_speed(iter/s)": 0.207421 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.976447582244873, + "learning_rate": 7.829735471652978e-05, + "loss": 0.20344338417053223, + "memory(GiB)": 194.42, + "step": 340, + "token_acc": 0.9203519855595668, + "train_speed(iter/s)": 0.208201 + }, + { + "epoch": 1.7171717171717171, + "eval_loss": 0.34089022874832153, + "eval_runtime": 1.8007, + "eval_samples_per_second": 2.221, + "eval_steps_per_second": 2.221, + "eval_token_acc": 0.7562437562437563, + "step": 340 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.5278966426849365, + "learning_rate": 
7.760459030751284e-05, + "loss": 0.24397382736206055, + "memory(GiB)": 194.42, + "step": 345, + "token_acc": 0.9052574525745257, + "train_speed(iter/s)": 0.207523 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 1.4450560808181763, + "learning_rate": 7.690411765816864e-05, + "loss": 0.18279753923416137, + "memory(GiB)": 194.42, + "step": 350, + "token_acc": 0.9326676907322069, + "train_speed(iter/s)": 0.208339 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.9007227420806885, + "learning_rate": 7.619613236681843e-05, + "loss": 0.3635742664337158, + "memory(GiB)": 194.42, + "step": 355, + "token_acc": 0.8810979752683789, + "train_speed(iter/s)": 0.208407 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3327200412750244, + "learning_rate": 7.548083212959588e-05, + "loss": 0.2239600419998169, + "memory(GiB)": 194.42, + "step": 360, + "token_acc": 0.908592093777279, + "train_speed(iter/s)": 0.208397 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.3721417784690857, + "eval_runtime": 1.8042, + "eval_samples_per_second": 2.217, + "eval_steps_per_second": 2.217, + "eval_token_acc": 0.7482517482517482, + "step": 360 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.898142397403717, + "learning_rate": 7.475841668524268e-05, + "loss": 0.2930032253265381, + "memory(GiB)": 194.42, + "step": 365, + "token_acc": 0.8698196001950268, + "train_speed(iter/s)": 0.208386 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.37038329243659973, + "learning_rate": 7.402908775933419e-05, + "loss": 0.30235629081726073, + "memory(GiB)": 194.42, + "step": 370, + "token_acc": 0.8878266411727215, + "train_speed(iter/s)": 0.208339 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.5051683187484741, + "learning_rate": 7.329304900794991e-05, + "loss": 0.3541992664337158, + "memory(GiB)": 194.42, + "step": 375, + "token_acc": 0.8819656712908536, + "train_speed(iter/s)": 0.208034 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.716480016708374, + 
"learning_rate": 7.255050596080509e-05, + "loss": 0.31901164054870607, + "memory(GiB)": 194.42, + "step": 380, + "token_acc": 0.8962932111620159, + "train_speed(iter/s)": 0.208147 + }, + { + "epoch": 1.9191919191919191, + "eval_loss": 0.31777071952819824, + "eval_runtime": 1.8015, + "eval_samples_per_second": 2.22, + "eval_steps_per_second": 2.22, + "eval_token_acc": 0.7632367632367633, + "step": 380 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.24409626424312592, + "learning_rate": 7.180166596385914e-05, + "loss": 0.3302799701690674, + "memory(GiB)": 194.42, + "step": 385, + "token_acc": 0.8846450617283951, + "train_speed(iter/s)": 0.207591 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.41487976908683777, + "learning_rate": 7.104673812141675e-05, + "loss": 0.23348629474639893, + "memory(GiB)": 194.42, + "step": 390, + "token_acc": 0.9038998906572713, + "train_speed(iter/s)": 0.207441 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3993523418903351, + "learning_rate": 7.02859332377382e-05, + "loss": 0.19255179166793823, + "memory(GiB)": 194.42, + "step": 395, + "token_acc": 0.9186485885752389, + "train_speed(iter/s)": 0.208111 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.3870049715042114, + "learning_rate": 6.951946375817474e-05, + "loss": 0.17468175888061524, + "memory(GiB)": 194.42, + "step": 400, + "token_acc": 0.9561740243122201, + "train_speed(iter/s)": 0.209087 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.28842809796333313, + "eval_runtime": 1.7921, + "eval_samples_per_second": 2.232, + "eval_steps_per_second": 2.232, + "eval_token_acc": 0.7612387612387612, + "step": 400 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.8590208292007446, + "learning_rate": 6.874754370984606e-05, + "loss": 0.1243563175201416, + "memory(GiB)": 194.42, + "step": 405, + "token_acc": 0.9203394470298385, + "train_speed(iter/s)": 0.208801 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 1.1880351305007935, + 
"learning_rate": 6.797038864187564e-05, + "loss": 0.12814103364944457, + "memory(GiB)": 194.42, + "step": 410, + "token_acc": 0.9328753399169887, + "train_speed(iter/s)": 0.208916 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.5621036887168884, + "learning_rate": 6.718821556520151e-05, + "loss": 0.1351101279258728, + "memory(GiB)": 194.42, + "step": 415, + "token_acc": 0.9418245923314236, + "train_speed(iter/s)": 0.20947 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.3390972912311554, + "learning_rate": 6.640124289197845e-05, + "loss": 0.08711874485015869, + "memory(GiB)": 194.42, + "step": 420, + "token_acc": 0.9826417141307295, + "train_speed(iter/s)": 0.21021 + }, + { + "epoch": 2.121212121212121, + "eval_loss": 0.2877776026725769, + "eval_runtime": 1.7912, + "eval_samples_per_second": 2.233, + "eval_steps_per_second": 2.233, + "eval_token_acc": 0.7612387612387612, + "step": 420 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.6231220960617065, + "learning_rate": 6.560969037458933e-05, + "loss": 0.14229173660278321, + "memory(GiB)": 194.42, + "step": 425, + "token_acc": 0.9111275964391692, + "train_speed(iter/s)": 0.210123 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.2991831600666046, + "learning_rate": 6.481377904428171e-05, + "loss": 0.09396474957466125, + "memory(GiB)": 194.42, + "step": 430, + "token_acc": 0.9717786854808763, + "train_speed(iter/s)": 0.209878 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.2828837037086487, + "learning_rate": 6.401373114944781e-05, + "loss": 0.1196476936340332, + "memory(GiB)": 194.42, + "step": 435, + "token_acc": 0.9668395702111894, + "train_speed(iter/s)": 0.209161 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8178226947784424, + "learning_rate": 6.320977009356431e-05, + "loss": 0.13254005908966066, + "memory(GiB)": 194.42, + "step": 440, + "token_acc": 0.9543882759756298, + "train_speed(iter/s)": 0.209483 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 
0.3008834719657898, + "eval_runtime": 1.7791, + "eval_samples_per_second": 2.248, + "eval_steps_per_second": 2.248, + "eval_token_acc": 0.7562437562437563, + "step": 440 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.4310154318809509, + "learning_rate": 6.240212037280966e-05, + "loss": 0.055006617307662965, + "memory(GiB)": 194.42, + "step": 445, + "token_acc": 0.9505077773492737, + "train_speed(iter/s)": 0.209114 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.8850460648536682, + "learning_rate": 6.159100751337642e-05, + "loss": 0.1947050929069519, + "memory(GiB)": 194.42, + "step": 450, + "token_acc": 0.932483120780195, + "train_speed(iter/s)": 0.209501 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.4071020185947418, + "learning_rate": 6.077665800849568e-05, + "loss": 0.0962955117225647, + "memory(GiB)": 194.42, + "step": 455, + "token_acc": 0.9671393509680938, + "train_speed(iter/s)": 0.209586 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.41429275274276733, + "learning_rate": 5.99592992551918e-05, + "loss": 0.11653723716735839, + "memory(GiB)": 194.42, + "step": 460, + "token_acc": 0.9625259129325466, + "train_speed(iter/s)": 0.209806 + }, + { + "epoch": 2.323232323232323, + "eval_loss": 0.3004966974258423, + "eval_runtime": 1.7662, + "eval_samples_per_second": 2.265, + "eval_steps_per_second": 2.265, + "eval_token_acc": 0.7532467532467533, + "step": 460 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.5336225628852844, + "learning_rate": 5.913915949078452e-05, + "loss": 0.12927284240722656, + "memory(GiB)": 194.42, + "step": 465, + "token_acc": 0.9099099099099099, + "train_speed(iter/s)": 0.210062 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.6900014281272888, + "learning_rate": 5.831646772915651e-05, + "loss": 0.08137755393981934, + "memory(GiB)": 194.42, + "step": 470, + "token_acc": 0.9656623081296191, + "train_speed(iter/s)": 0.209705 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 
0.8796831369400024, + "learning_rate": 5.749145369680407e-05, + "loss": 0.10649137496948242, + "memory(GiB)": 194.42, + "step": 475, + "token_acc": 0.9598332701780977, + "train_speed(iter/s)": 0.210106 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.33510109782218933, + "learning_rate": 5.666434776868895e-05, + "loss": 0.17189998626708985, + "memory(GiB)": 194.42, + "step": 480, + "token_acc": 0.9404330609149886, + "train_speed(iter/s)": 0.209154 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.3011992573738098, + "eval_runtime": 1.7649, + "eval_samples_per_second": 2.266, + "eval_steps_per_second": 2.266, + "eval_token_acc": 0.7582417582417582, + "step": 480 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.720079243183136, + "learning_rate": 5.583538090390882e-05, + "loss": 0.0982659637928009, + "memory(GiB)": 194.42, + "step": 485, + "token_acc": 0.9170192759755524, + "train_speed(iter/s)": 0.209559 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.5884720087051392, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.19064462184906006, + "memory(GiB)": 194.42, + "step": 490, + "token_acc": 0.9257759784075573, + "train_speed(iter/s)": 0.209499 + }, + { + "epoch": 2.5, + "grad_norm": 0.33910003304481506, + "learning_rate": 5.41727907343245e-05, + "loss": 0.10975323915481568, + "memory(GiB)": 194.42, + "step": 495, + "token_acc": 0.958795231123196, + "train_speed(iter/s)": 0.209984 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.5363466143608093, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.17116068601608275, + "memory(GiB)": 194.42, + "step": 500, + "token_acc": 0.9163900944600459, + "train_speed(iter/s)": 0.209844 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 0.2906345725059509, + "eval_runtime": 1.8092, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7522477522477522, + "step": 500 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.6032365560531616, + 
"learning_rate": 5.250554008935596e-05, + "loss": 0.09048279523849487, + "memory(GiB)": 194.42, + "step": 505, + "token_acc": 0.9289609432571849, + "train_speed(iter/s)": 0.209717 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.6278122067451477, + "learning_rate": 5.167074885038373e-05, + "loss": 0.09625995755195618, + "memory(GiB)": 194.42, + "step": 510, + "token_acc": 0.9664948453608248, + "train_speed(iter/s)": 0.210284 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.38628754019737244, + "learning_rate": 5.0835491075465045e-05, + "loss": 0.13246564865112304, + "memory(GiB)": 194.42, + "step": 515, + "token_acc": 0.9501214574898785, + "train_speed(iter/s)": 0.210549 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.465506374835968, + "learning_rate": 5e-05, + "loss": 0.13336524963378907, + "memory(GiB)": 194.42, + "step": 520, + "token_acc": 0.942733657482442, + "train_speed(iter/s)": 0.210504 + }, + { + "epoch": 2.6262626262626263, + "eval_loss": 0.2984510660171509, + "eval_runtime": 1.8075, + "eval_samples_per_second": 2.213, + "eval_steps_per_second": 2.213, + "eval_token_acc": 0.7502497502497503, + "step": 520 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.4542198181152344, + "learning_rate": 4.916450892453495e-05, + "loss": 0.09229624271392822, + "memory(GiB)": 194.42, + "step": 525, + "token_acc": 0.931699604743083, + "train_speed(iter/s)": 0.210556 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.3877967596054077, + "learning_rate": 4.832925114961629e-05, + "loss": 0.16157912015914916, + "memory(GiB)": 194.42, + "step": 530, + "token_acc": 0.9453226706341826, + "train_speed(iter/s)": 0.209926 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.7550373077392578, + "learning_rate": 4.749445991064404e-05, + "loss": 0.09830494523048401, + "memory(GiB)": 194.42, + "step": 535, + "token_acc": 0.9667737290951379, + "train_speed(iter/s)": 0.209454 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 
0.8958114981651306, + "learning_rate": 4.666036831274392e-05, + "loss": 0.21627907752990722, + "memory(GiB)": 194.42, + "step": 540, + "token_acc": 0.91897499740637, + "train_speed(iter/s)": 0.20902 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.29240885376930237, + "eval_runtime": 1.8087, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7582417582417582, + "step": 540 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.6230780482292175, + "learning_rate": 4.582720926567552e-05, + "loss": 0.15035767555236818, + "memory(GiB)": 194.42, + "step": 545, + "token_acc": 0.9105213715359324, + "train_speed(iter/s)": 0.208676 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.2731610834598541, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.10666660070419312, + "memory(GiB)": 194.42, + "step": 550, + "token_acc": 0.9625537139349294, + "train_speed(iter/s)": 0.20872 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.5083901286125183, + "learning_rate": 4.416461909609119e-05, + "loss": 0.092490154504776, + "memory(GiB)": 194.42, + "step": 555, + "token_acc": 0.9688758129451843, + "train_speed(iter/s)": 0.208968 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.3543936014175415, + "learning_rate": 4.333565223131107e-05, + "loss": 0.12382739782333374, + "memory(GiB)": 194.42, + "step": 560, + "token_acc": 0.9427140588738243, + "train_speed(iter/s)": 0.208825 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.28748947381973267, + "eval_runtime": 1.8169, + "eval_samples_per_second": 2.202, + "eval_steps_per_second": 2.202, + "eval_token_acc": 0.7542457542457542, + "step": 560 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.6817888617515564, + "learning_rate": 4.250854630319593e-05, + "loss": 0.14281988143920898, + "memory(GiB)": 194.42, + "step": 565, + "token_acc": 0.9215728176087189, + "train_speed(iter/s)": 0.208389 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 
0.6411938071250916, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.19629952907562256, + "memory(GiB)": 194.42, + "step": 570, + "token_acc": 0.9246951219512195, + "train_speed(iter/s)": 0.208033 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.7665055990219116, + "learning_rate": 4.0860840509215496e-05, + "loss": 0.1060869812965393, + "memory(GiB)": 194.42, + "step": 575, + "token_acc": 0.9567567567567568, + "train_speed(iter/s)": 0.207971 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 2.561556100845337, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.1460867166519165, + "memory(GiB)": 194.42, + "step": 580, + "token_acc": 0.9379691821414461, + "train_speed(iter/s)": 0.208359 + }, + { + "epoch": 2.929292929292929, + "eval_loss": 0.2931186854839325, + "eval_runtime": 1.8095, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 2.211, + "eval_token_acc": 0.7532467532467533, + "step": 580 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.7562662363052368, + "learning_rate": 3.922334199150432e-05, + "loss": 0.14551491737365724, + "memory(GiB)": 194.42, + "step": 585, + "token_acc": 0.907440654298823, + "train_speed(iter/s)": 0.208553 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.7358995676040649, + "learning_rate": 3.840899248662358e-05, + "loss": 0.14465657472610474, + "memory(GiB)": 194.42, + "step": 590, + "token_acc": 0.9507758159443552, + "train_speed(iter/s)": 0.208115 + }, + { + "epoch": 3.005050505050505, + "grad_norm": 0.2242085188627243, + "learning_rate": 3.7597879627190334e-05, + "loss": 0.12801222801208495, + "memory(GiB)": 194.42, + "step": 595, + "token_acc": 0.9628496042216359, + "train_speed(iter/s)": 0.207756 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.5652392506599426, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.05944470763206482, + "memory(GiB)": 194.42, + "step": 600, + "token_acc": 0.9819713314615044, + "train_speed(iter/s)": 0.207893 + }, + { + "epoch": 
3.0303030303030303, + "eval_loss": 0.30186352133750916, + "eval_runtime": 1.8433, + "eval_samples_per_second": 2.17, + "eval_steps_per_second": 2.17, + "eval_token_acc": 0.7572427572427572, + "step": 600 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.6330710649490356, + "learning_rate": 3.598626885055219e-05, + "loss": 0.04340478777885437, + "memory(GiB)": 194.42, + "step": 605, + "token_acc": 0.9578030154689642, + "train_speed(iter/s)": 0.207185 + }, + { + "epoch": 3.080808080808081, + "grad_norm": 0.2928420305252075, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.019147023558616638, + "memory(GiB)": 194.42, + "step": 610, + "token_acc": 0.9906427990235964, + "train_speed(iter/s)": 0.207564 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.22657504677772522, + "learning_rate": 3.4390309625410686e-05, + "loss": 0.01184888556599617, + "memory(GiB)": 194.42, + "step": 615, + "token_acc": 0.9950428120775124, + "train_speed(iter/s)": 0.207984 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 0.38536500930786133, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.03439113795757294, + "memory(GiB)": 194.42, + "step": 620, + "token_acc": 0.9885694884563151, + "train_speed(iter/s)": 0.207846 + }, + { + "epoch": 3.1313131313131315, + "eval_loss": 0.3092794716358185, + "eval_runtime": 1.8078, + "eval_samples_per_second": 2.213, + "eval_steps_per_second": 2.213, + "eval_token_acc": 0.7482517482517482, + "step": 620 + }, + { + "epoch": 3.1565656565656566, + "grad_norm": 0.4387037754058838, + "learning_rate": 3.281178443479852e-05, + "loss": 0.033835414052009585, + "memory(GiB)": 194.42, + "step": 625, + "token_acc": 0.9644806032344478, + "train_speed(iter/s)": 0.207235 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.5069209337234497, + "learning_rate": 3.202961135812437e-05, + "loss": 0.04778254330158234, + "memory(GiB)": 194.43, + "step": 630, + "token_acc": 0.9859832635983263, + "train_speed(iter/s)": 0.207611 + }, + { + "epoch": 
3.207070707070707, + "grad_norm": 0.7499670386314392, + "learning_rate": 3.1252456290153954e-05, + "loss": 0.06957237720489502, + "memory(GiB)": 194.43, + "step": 635, + "token_acc": 0.9698315118397086, + "train_speed(iter/s)": 0.207401 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.4098314642906189, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.06834298372268677, + "memory(GiB)": 194.43, + "step": 640, + "token_acc": 0.9677576941866145, + "train_speed(iter/s)": 0.206982 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.3071475625038147, + "eval_runtime": 1.8289, + "eval_samples_per_second": 2.187, + "eval_steps_per_second": 2.187, + "eval_token_acc": 0.7492507492507493, + "step": 640 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.6657339930534363, + "learning_rate": 2.9714066762261823e-05, + "loss": 0.053527307510375974, + "memory(GiB)": 194.43, + "step": 645, + "token_acc": 0.9555773168343275, + "train_speed(iter/s)": 0.206307 + }, + { + "epoch": 3.282828282828283, + "grad_norm": 0.35414841771125793, + "learning_rate": 2.895326187858326e-05, + "loss": 0.058514750003814696, + "memory(GiB)": 194.43, + "step": 650, + "token_acc": 0.980971797485559, + "train_speed(iter/s)": 0.206557 + }, + { + "epoch": 3.308080808080808, + "grad_norm": 0.41236743330955505, + "learning_rate": 2.8198334036140874e-05, + "loss": 0.025739893317222595, + "memory(GiB)": 194.43, + "step": 655, + "token_acc": 0.9937519525148392, + "train_speed(iter/s)": 0.206671 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5106225609779358, + "learning_rate": 2.74494940391949e-05, + "loss": 0.04316897392272949, + "memory(GiB)": 194.43, + "step": 660, + "token_acc": 0.9850356294536817, + "train_speed(iter/s)": 0.206584 + }, + { + "epoch": 3.3333333333333335, + "eval_loss": 0.30926382541656494, + "eval_runtime": 1.7766, + "eval_samples_per_second": 2.251, + "eval_steps_per_second": 2.251, + "eval_token_acc": 0.7462537462537463, + "step": 660 + }, + { + "epoch": 
3.3585858585858586, + "grad_norm": 0.7511043548583984, + "learning_rate": 2.6706950992050094e-05, + "loss": 0.044805902242660525, + "memory(GiB)": 194.43, + "step": 665, + "token_acc": 0.9384410139127121, + "train_speed(iter/s)": 0.206749 + }, + { + "epoch": 3.3838383838383836, + "grad_norm": 0.7061121463775635, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.04475100636482239, + "memory(GiB)": 194.43, + "step": 670, + "token_acc": 0.9816687737041719, + "train_speed(iter/s)": 0.206885 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.35980212688446045, + "learning_rate": 2.5241583314757327e-05, + "loss": 0.036676472425460814, + "memory(GiB)": 194.43, + "step": 675, + "token_acc": 0.9836784836784837, + "train_speed(iter/s)": 0.207096 + }, + { + "epoch": 3.4343434343434343, + "grad_norm": 0.49686399102211, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.05962592363357544, + "memory(GiB)": 194.43, + "step": 680, + "token_acc": 0.9733040775278844, + "train_speed(iter/s)": 0.207419 + }, + { + "epoch": 3.4343434343434343, + "eval_loss": 0.30547747015953064, + "eval_runtime": 1.7648, + "eval_samples_per_second": 2.267, + "eval_steps_per_second": 2.267, + "eval_token_acc": 0.7422577422577422, + "step": 680 + }, + { + "epoch": 3.45959595959596, + "grad_norm": 0.36548638343811035, + "learning_rate": 2.3803867633181574e-05, + "loss": 0.07604837417602539, + "memory(GiB)": 194.43, + "step": 685, + "token_acc": 0.9316927830500993, + "train_speed(iter/s)": 0.207066 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.9540855288505554, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.05938977003097534, + "memory(GiB)": 194.43, + "step": 690, + "token_acc": 0.9727115716753022, + "train_speed(iter/s)": 0.206879 + }, + { + "epoch": 3.51010101010101, + "grad_norm": 0.49375462532043457, + "learning_rate": 2.2395409692487175e-05, + "loss": 0.05741435885429382, + "memory(GiB)": 194.43, + "step": 695, + "token_acc": 0.9801678108314263, + "train_speed(iter/s)": 
0.207044 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.1865593045949936, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.013945281505584717, + "memory(GiB)": 194.43, + "step": 700, + "token_acc": 0.9956945388624519, + "train_speed(iter/s)": 0.207418 + }, + { + "epoch": 3.5353535353535355, + "eval_loss": 0.30903568863868713, + "eval_runtime": 1.7685, + "eval_samples_per_second": 2.262, + "eval_steps_per_second": 2.262, + "eval_token_acc": 0.7432567432567433, + "step": 700 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.2819386422634125, + "learning_rate": 2.1017782560671123e-05, + "loss": 0.03774779438972473, + "memory(GiB)": 194.43, + "step": 705, + "token_acc": 0.939033597583994, + "train_speed(iter/s)": 0.207569 + }, + { + "epoch": 3.5858585858585856, + "grad_norm": 0.43836522102355957, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.03159322738647461, + "memory(GiB)": 194.43, + "step": 710, + "token_acc": 0.9888877240800923, + "train_speed(iter/s)": 0.207322 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.6935763955116272, + "learning_rate": 1.967252487164663e-05, + "loss": 0.002558489516377449, + "memory(GiB)": 194.43, + "step": 715, + "token_acc": 0.9991300565463245, + "train_speed(iter/s)": 0.20803 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.7125621438026428, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.01584031879901886, + "memory(GiB)": 194.43, + "step": 720, + "token_acc": 0.9957315627223144, + "train_speed(iter/s)": 0.207895 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.3177046775817871, + "eval_runtime": 1.7785, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 2.249, + "eval_token_acc": 0.7432567432567433, + "step": 720 + }, + { + "epoch": 3.6616161616161618, + "grad_norm": 0.4679706394672394, + "learning_rate": 1.836113910678507e-05, + "loss": 0.03762938678264618, + "memory(GiB)": 194.43, + "step": 725, + "token_acc": 0.955343466478143, + "train_speed(iter/s)": 
0.207656 + }, + { + "epoch": 3.686868686868687, + "grad_norm": 0.318162739276886, + "learning_rate": 1.771860742205988e-05, + "loss": 0.05497429370880127, + "memory(GiB)": 194.43, + "step": 730, + "token_acc": 0.9796425024826216, + "train_speed(iter/s)": 0.207349 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.3959096372127533, + "learning_rate": 1.7085089916835923e-05, + "loss": 0.035741007328033446, + "memory(GiB)": 194.43, + "step": 735, + "token_acc": 0.9839179435228568, + "train_speed(iter/s)": 0.207241 + }, + { + "epoch": 3.7373737373737375, + "grad_norm": 0.6112183332443237, + "learning_rate": 1.646076349303884e-05, + "loss": 0.045097559690475464, + "memory(GiB)": 194.43, + "step": 740, + "token_acc": 0.987012987012987, + "train_speed(iter/s)": 0.207546 + }, + { + "epoch": 3.7373737373737375, + "eval_loss": 0.31488630175590515, + "eval_runtime": 1.8107, + "eval_samples_per_second": 2.209, + "eval_steps_per_second": 2.209, + "eval_token_acc": 0.7482517482517482, + "step": 740 + }, + { + "epoch": 3.7626262626262625, + "grad_norm": 0.2885132431983948, + "learning_rate": 1.584580248609846e-05, + "loss": 0.03688704967498779, + "memory(GiB)": 194.43, + "step": 745, + "token_acc": 0.9172789115646258, + "train_speed(iter/s)": 0.207875 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.7163310050964355, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.034329149127006534, + "memory(GiB)": 194.45, + "step": 750, + "token_acc": 0.9898200757575758, + "train_speed(iter/s)": 0.208273 + }, + { + "epoch": 3.813131313131313, + "grad_norm": 0.4935724139213562, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.11100113391876221, + "memory(GiB)": 194.45, + "step": 755, + "token_acc": 0.9534293604000299, + "train_speed(iter/s)": 0.207493 + }, + { + "epoch": 3.8383838383838382, + "grad_norm": 0.6098226308822632, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.011808309704065323, + "memory(GiB)": 194.45, + "step": 760, + "token_acc": 
0.995991448423303, + "train_speed(iter/s)": 0.208006 + }, + { + "epoch": 3.8383838383838382, + "eval_loss": 0.3122360408306122, + "eval_runtime": 1.7841, + "eval_samples_per_second": 2.242, + "eval_steps_per_second": 2.242, + "eval_token_acc": 0.7452547452547452, + "step": 760 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.4670480489730835, + "learning_rate": 1.3483006802566544e-05, + "loss": 0.05923340916633606, + "memory(GiB)": 194.45, + "step": 765, + "token_acc": 0.9374712643678161, + "train_speed(iter/s)": 0.207991 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.22090964019298553, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.03186772465705871, + "memory(GiB)": 194.45, + "step": 770, + "token_acc": 0.9895613272026842, + "train_speed(iter/s)": 0.207996 + }, + { + "epoch": 3.9141414141414144, + "grad_norm": 0.48269495368003845, + "learning_rate": 1.2362137491387432e-05, + "loss": 0.03672673106193543, + "memory(GiB)": 194.45, + "step": 775, + "token_acc": 0.9879902705989663, + "train_speed(iter/s)": 0.207983 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.4281235635280609, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.047962135076522826, + "memory(GiB)": 194.45, + "step": 780, + "token_acc": 0.980590717299578, + "train_speed(iter/s)": 0.207969 + }, + { + "epoch": 3.9393939393939394, + "eval_loss": 0.3114301860332489, + "eval_runtime": 1.7899, + "eval_samples_per_second": 2.235, + "eval_steps_per_second": 2.235, + "eval_token_acc": 0.7472527472527473, + "step": 780 + }, + { + "epoch": 3.9646464646464645, + "grad_norm": 0.45910346508026123, + "learning_rate": 1.1283304875289336e-05, + "loss": 0.03378086090087891, + "memory(GiB)": 194.45, + "step": 785, + "token_acc": 0.9337579617834395, + "train_speed(iter/s)": 0.208066 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 0.5526273250579834, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.0462653785943985, + "memory(GiB)": 194.45, + "step": 790, + "token_acc": 
0.9858657243816255, + "train_speed(iter/s)": 0.208239 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.28537100553512573, + "learning_rate": 1.024771387279585e-05, + "loss": 0.033093854784965515, + "memory(GiB)": 194.45, + "step": 795, + "token_acc": 0.9894450663681433, + "train_speed(iter/s)": 0.208413 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 0.26249536871910095, + "learning_rate": 9.746497343621857e-06, + "loss": 0.008343618363142014, + "memory(GiB)": 194.45, + "step": 800, + "token_acc": 0.9987666164177059, + "train_speed(iter/s)": 0.208448 + }, + { + "epoch": 4.040404040404041, + "eval_loss": 0.31291159987449646, + "eval_runtime": 1.8149, + "eval_samples_per_second": 2.204, + "eval_steps_per_second": 2.204, + "eval_token_acc": 0.7442557442557443, + "step": 800 + }, + { + "epoch": 4.065656565656566, + "grad_norm": 0.37422114610671997, + "learning_rate": 9.256521107059834e-06, + "loss": 0.01603064388036728, + "memory(GiB)": 194.45, + "step": 805, + "token_acc": 0.962771327612317, + "train_speed(iter/s)": 0.208215 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.01700465753674507, + "learning_rate": 8.777921982911996e-06, + "loss": 0.0026650244370102884, + "memory(GiB)": 194.45, + "step": 810, + "token_acc": 0.9998035749361619, + "train_speed(iter/s)": 0.208521 + }, + { + "epoch": 4.116161616161616, + "grad_norm": 0.3416445255279541, + "learning_rate": 8.310833614062651e-06, + "loss": 0.017160463333129882, + "memory(GiB)": 194.45, + "step": 815, + "token_acc": 0.9931600547195623, + "train_speed(iter/s)": 0.208645 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.025509856641292572, + "learning_rate": 7.85538642916015e-06, + "loss": 0.007264973968267441, + "memory(GiB)": 194.45, + "step": 820, + "token_acc": 0.9985950671245707, + "train_speed(iter/s)": 0.208653 + }, + { + "epoch": 4.141414141414141, + "eval_loss": 0.31731295585632324, + "eval_runtime": 1.7956, + "eval_samples_per_second": 2.228, + "eval_steps_per_second": 2.228, + 
"eval_token_acc": 0.7422577422577422, + "step": 820 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.27206850051879883, + "learning_rate": 7.4117076061961885e-06, + "loss": 0.02214038074016571, + "memory(GiB)": 194.45, + "step": 825, + "token_acc": 0.9670045287901661, + "train_speed(iter/s)": 0.208283 + }, + { + "epoch": 4.191919191919192, + "grad_norm": 0.3412053883075714, + "learning_rate": 6.979921036993042e-06, + "loss": 0.04925653636455536, + "memory(GiB)": 194.45, + "step": 830, + "token_acc": 0.9612062655540916, + "train_speed(iter/s)": 0.208304 + }, + { + "epoch": 4.217171717171717, + "grad_norm": 0.3712901771068573, + "learning_rate": 6.5601472926081766e-06, + "loss": 0.025851538777351378, + "memory(GiB)": 194.45, + "step": 835, + "token_acc": 0.9915025106218617, + "train_speed(iter/s)": 0.208305 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.31404536962509155, + "learning_rate": 6.152503589666425e-06, + "loss": 0.027274680137634278, + "memory(GiB)": 194.45, + "step": 840, + "token_acc": 0.9867660142348754, + "train_speed(iter/s)": 0.208162 + }, + { + "epoch": 4.242424242424242, + "eval_loss": 0.32319602370262146, + "eval_runtime": 1.7959, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 2.227, + "eval_token_acc": 0.7432567432567433, + "step": 840 + }, + { + "epoch": 4.267676767676767, + "grad_norm": 0.7435818910598755, + "learning_rate": 5.757103757628573e-06, + "loss": 0.07043209075927734, + "memory(GiB)": 194.45, + "step": 845, + "token_acc": 0.9483071053886505, + "train_speed(iter/s)": 0.207659 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 0.396167129278183, + "learning_rate": 5.374058207005944e-06, + "loss": 0.0496614933013916, + "memory(GiB)": 194.45, + "step": 850, + "token_acc": 0.9817774610607757, + "train_speed(iter/s)": 0.207405 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.12084542959928513, + "learning_rate": 5.0034738985296095e-06, + "loss": 0.01363416612148285, + "memory(GiB)": 194.45, + 
"step": 855, + "token_acc": 0.9948678750818956, + "train_speed(iter/s)": 0.207257 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 0.1820814311504364, + "learning_rate": 4.645454313282965e-06, + "loss": 0.010331088304519653, + "memory(GiB)": 194.45, + "step": 860, + "token_acc": 0.9975838926174496, + "train_speed(iter/s)": 0.207206 + }, + { + "epoch": 4.343434343434343, + "eval_loss": 0.328104168176651, + "eval_runtime": 1.824, + "eval_samples_per_second": 2.193, + "eval_steps_per_second": 2.193, + "eval_token_acc": 0.7452547452547452, + "step": 860 + }, + { + "epoch": 4.3686868686868685, + "grad_norm": 0.1977299004793167, + "learning_rate": 4.3000994238058644e-06, + "loss": 0.009320738911628722, + "memory(GiB)": 194.45, + "step": 865, + "token_acc": 0.9639168343393696, + "train_speed(iter/s)": 0.207088 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.20129217207431793, + "learning_rate": 3.967505666178556e-06, + "loss": 0.007078251987695694, + "memory(GiB)": 194.45, + "step": 870, + "token_acc": 0.9981830194912454, + "train_speed(iter/s)": 0.207247 + }, + { + "epoch": 4.41919191919192, + "grad_norm": 0.15680421888828278, + "learning_rate": 3.647765913093132e-06, + "loss": 0.006438987702131272, + "memory(GiB)": 194.45, + "step": 875, + "token_acc": 0.9970992071166118, + "train_speed(iter/s)": 0.207512 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.12446325272321701, + "learning_rate": 3.340969447919873e-06, + "loss": 0.01718273162841797, + "memory(GiB)": 194.45, + "step": 880, + "token_acc": 0.9943947886683836, + "train_speed(iter/s)": 0.207596 + }, + { + "epoch": 4.444444444444445, + "eval_loss": 0.3302783668041229, + "eval_runtime": 1.8139, + "eval_samples_per_second": 2.205, + "eval_steps_per_second": 2.205, + "eval_token_acc": 0.7432567432567433, + "step": 880 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.21941973268985748, + "learning_rate": 3.0472019397761064e-06, + "loss": 0.0076537981629371645, + "memory(GiB)": 194.45, + 
"step": 885, + "token_acc": 0.9654152738493186, + "train_speed(iter/s)": 0.207479 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 0.4180464744567871, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.013453680276870727, + "memory(GiB)": 194.45, + "step": 890, + "token_acc": 0.9971350613915416, + "train_speed(iter/s)": 0.207427 + }, + { + "epoch": 4.52020202020202, + "grad_norm": 0.36694836616516113, + "learning_rate": 2.4990782572647975e-06, + "loss": 0.022955694794654848, + "memory(GiB)": 194.45, + "step": 895, + "token_acc": 0.9932196424902404, + "train_speed(iter/s)": 0.207683 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.3065175712108612, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.03834371864795685, + "memory(GiB)": 194.45, + "step": 900, + "token_acc": 0.9876448720752241, + "train_speed(iter/s)": 0.207506 + }, + { + "epoch": 4.545454545454545, + "eval_loss": 0.33111608028411865, + "eval_runtime": 1.8047, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 2.216, + "eval_token_acc": 0.7402597402597403, + "step": 900 + }, + { + "epoch": 4.570707070707071, + "grad_norm": 0.06433413922786713, + "learning_rate": 2.004007049848461e-06, + "loss": 0.0027513707056641577, + "memory(GiB)": 194.45, + "step": 905, + "token_acc": 0.9582614153459909, + "train_speed(iter/s)": 0.207542 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 0.08144180476665497, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.007540231198072433, + "memory(GiB)": 194.45, + "step": 910, + "token_acc": 0.9984427718660783, + "train_speed(iter/s)": 0.207932 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.014387480914592743, + "learning_rate": 1.5625412489637337e-06, + "loss": 0.01742658168077469, + "memory(GiB)": 194.45, + "step": 915, + "token_acc": 0.9934481182386103, + "train_speed(iter/s)": 0.208031 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 0.29279810190200806, + "learning_rate": 1.3620668117481472e-06, + "loss": 
0.005430782586336136, + "memory(GiB)": 194.45, + "step": 920, + "token_acc": 0.9978392394122731, + "train_speed(iter/s)": 0.208309 + }, + { + "epoch": 4.646464646464646, + "eval_loss": 0.3334895372390747, + "eval_runtime": 1.796, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 2.227, + "eval_token_acc": 0.7372627372627373, + "step": 920 + }, + { + "epoch": 4.671717171717171, + "grad_norm": 0.14231210947036743, + "learning_rate": 1.1751739156407649e-06, + "loss": 0.002384462393820286, + "memory(GiB)": 194.45, + "step": 925, + "token_acc": 0.9558604728054224, + "train_speed(iter/s)": 0.208319 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.0057262247428298, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.014989945292472839, + "memory(GiB)": 194.45, + "step": 930, + "token_acc": 0.9928400954653938, + "train_speed(iter/s)": 0.208549 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.4201620817184448, + "learning_rate": 8.423376898168245e-07, + "loss": 0.02435753494501114, + "memory(GiB)": 194.45, + "step": 935, + "token_acc": 0.9895142941932931, + "train_speed(iter/s)": 0.208233 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 0.15343283116817474, + "learning_rate": 6.964873004985717e-07, + "loss": 0.015529252588748932, + "memory(GiB)": 194.45, + "step": 940, + "token_acc": 0.9936803592216863, + "train_speed(iter/s)": 0.208371 + }, + { + "epoch": 4.747474747474747, + "eval_loss": 0.3332846760749817, + "eval_runtime": 1.8041, + "eval_samples_per_second": 2.217, + "eval_steps_per_second": 2.217, + "eval_token_acc": 0.7412587412587412, + "step": 940 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.08844652026891708, + "learning_rate": 5.644043071326932e-07, + "loss": 0.012052442133426666, + "memory(GiB)": 194.45, + "step": 945, + "token_acc": 0.9724211084592946, + "train_speed(iter/s)": 0.207774 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 0.014770667999982834, + "learning_rate": 4.461255922609986e-07, + "loss": 
0.010218892991542817, + "memory(GiB)": 194.45, + "step": 950, + "token_acc": 0.9942298445263664, + "train_speed(iter/s)": 0.207847 + }, + { + "epoch": 4.8232323232323235, + "grad_norm": 0.14186975359916687, + "learning_rate": 3.416841837512952e-07, + "loss": 0.003109816461801529, + "memory(GiB)": 194.45, + "step": 955, + "token_acc": 0.9989392734022806, + "train_speed(iter/s)": 0.207847 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.3466111421585083, + "learning_rate": 2.511092455747932e-07, + "loss": 0.027083656191825865, + "memory(GiB)": 194.45, + "step": 960, + "token_acc": 0.9918020343100046, + "train_speed(iter/s)": 0.207931 + }, + { + "epoch": 4.848484848484849, + "eval_loss": 0.33344024419784546, + "eval_runtime": 1.7787, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 2.249, + "eval_token_acc": 0.7412587412587412, + "step": 960 + }, + { + "epoch": 4.873737373737374, + "grad_norm": 0.18189793825149536, + "learning_rate": 1.7442606966242004e-07, + "loss": 0.0134581059217453, + "memory(GiB)": 194.45, + "step": 965, + "token_acc": 0.9534619750283768, + "train_speed(iter/s)": 0.207945 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 0.23113778233528137, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.008723243325948715, + "memory(GiB)": 194.45, + "step": 970, + "token_acc": 0.9973284354650191, + "train_speed(iter/s)": 0.20813 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.0857037901878357, + "learning_rate": 6.281677086071303e-08, + "loss": 0.005283674597740174, + "memory(GiB)": 194.45, + "step": 975, + "token_acc": 0.9985783915515841, + "train_speed(iter/s)": 0.208414 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 0.053703807294368744, + "learning_rate": 2.792181348726941e-08, + "loss": 0.035409435629844666, + "memory(GiB)": 194.45, + "step": 980, + "token_acc": 0.9834126862233143, + "train_speed(iter/s)": 0.208454 + }, + { + "epoch": 4.94949494949495, + "eval_loss": 0.33373889327049255, + "eval_runtime": 
1.7669, + "eval_samples_per_second": 2.264, + "eval_steps_per_second": 2.264, + "eval_token_acc": 0.7372627372627373, + "step": 980 + }, + { + "epoch": 4.974747474747475, + "grad_norm": 0.2665289044380188, + "learning_rate": 6.980940707146389e-09, + "loss": 0.06211912035942078, + "memory(GiB)": 194.45, + "step": 985, + "token_acc": 0.9500478265490487, + "train_speed(iter/s)": 0.208152 + }, + { + "epoch": 5.0, + "grad_norm": 0.14736372232437134, + "learning_rate": 0.0, + "loss": 0.003582773730158806, + "memory(GiB)": 194.45, + "step": 990, + "token_acc": 0.9995134609146935, + "train_speed(iter/s)": 0.208263 + }, + { + "epoch": 5.0, + "eval_loss": 0.3324023485183716, + "eval_runtime": 1.7947, + "eval_samples_per_second": 2.229, + "eval_steps_per_second": 2.229, + "eval_token_acc": 0.7432567432567433, + "step": 990 + } + ], + "logging_steps": 5, + "max_steps": 990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.310258299632026e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/training_args.bin b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2ade5288a82aa26858fccdfbb4e928d25b606382 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea0b2f710093e3c5b35669dba40183b02dbc7c28db39c667ac4476d3bbfefe6 +size 5944 diff --git 
a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..e98d80bf4d36270eee328f6304dd23e84a585fca Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_runtime.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..60db11d7df8daa75065e8d44c4a7484726a627d4 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_samples_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..454858f777b5334ecf59e2bd2e92f411a7f36445 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_samples_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_steps_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..b91991d0eb908a1d8342b97ad00abaa68fab1dcc Binary files /dev/null and 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_token_acc.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..a5b312b7bffbd51baea37f133dc07071fada27ca Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/eval_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_epoch.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..28af4713ddf3ee940ab6789f15d73de984b4c8ea Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_epoch.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_grad_norm.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..f3411c8e1b3b03262fded63b75ccf388b0176e58 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_grad_norm.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_learning_rate.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_learning_rate.png new file mode 100644 index 
0000000000000000000000000000000000000000..8977f64dd4afd86ccf4d0ec848be41268d0e90fc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_learning_rate.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c9c5878253dee75fb405f318fe0aa8e087020378 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_memory(GiB).png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..d579d9aa23cbaa6a57a4b7e12ecd7064d5343868 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_memory(GiB).png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_token_acc.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..563f8b4299b531aee27cfa003f6b7207948fbc64 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_token_acc.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_total_flos.png 
b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..971fad7c8b2843b000a848f086d88316941b5af2 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_total_flos.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_loss.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..e213fa8f1e7ed426d1e0fb760b23aab58d677ffc Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_loss.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_runtime.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..682092a7347d4ff5c33246d533bdc5888f194f44 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_runtime.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_samples_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..266a64173698034c3a1484299f64ccdeec77da48 Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_samples_per_second.png differ diff 
--git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_speed(iter_s).png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..2f6f23ba0e1f4716939901ea60b8680448578ead Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_speed(iter_s).png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_steps_per_second.png b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..27a113bc1f9997780aba3551b2268c7f8ef490fa Binary files /dev/null and b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/images/train_train_steps_per_second.png differ diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/logging.jsonl b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..329b03e7b533006bb532bd7204a51c879225e9f5 --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/logging.jsonl @@ -0,0 +1,251 @@ +{"loss": 0.58450192, "token_acc": 0.84862385, "grad_norm": 0.52126497, "learning_rate": 2e-06, "memory(GiB)": 143.92, "train_speed(iter/s)": 0.152386, "epoch": 0.00505051, "global_step/max_steps": "1/990", "percentage": "0.10%", "elapsed_time": "6s", "remaining_time": "1h 41m 54s"} +{"loss": 0.76121569, "token_acc": 0.82903069, "grad_norm": 0.8231858, "learning_rate": 1e-05, "memory(GiB)": 153.24, "train_speed(iter/s)": 
0.221696, "epoch": 0.02525253, "global_step/max_steps": "5/990", "percentage": "0.51%", "elapsed_time": "22s", "remaining_time": "1h 12m 48s"} +{"loss": 0.81032276, "token_acc": 0.78634967, "grad_norm": 0.61367083, "learning_rate": 2e-05, "memory(GiB)": 160.15, "train_speed(iter/s)": 0.228033, "epoch": 0.05050505, "global_step/max_steps": "10/990", "percentage": "1.01%", "elapsed_time": "43s", "remaining_time": "1h 11m 0s"} +{"loss": 0.7224256, "token_acc": 0.80023828, "grad_norm": 0.50656945, "learning_rate": 3e-05, "memory(GiB)": 169.88, "train_speed(iter/s)": 0.222881, "epoch": 0.07575758, "global_step/max_steps": "15/990", "percentage": "1.52%", "elapsed_time": "1m 6s", "remaining_time": "1h 12m 29s"} +{"loss": 0.78371954, "token_acc": 0.86508633, "grad_norm": 5.61674547, "learning_rate": 4e-05, "memory(GiB)": 169.88, "train_speed(iter/s)": 0.240866, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "1m 22s", "remaining_time": "1h 6m 48s"} +{"eval_loss": 1.15001416, "eval_token_acc": 0.72327672, "eval_runtime": 1.8151, "eval_samples_per_second": 2.204, "eval_steps_per_second": 2.204, "epoch": 0.1010101, "global_step/max_steps": "20/990", "percentage": "2.02%", "elapsed_time": "1m 24s", "remaining_time": "1h 8m 17s"} +{"loss": 0.51102223, "token_acc": 0.82794411, "grad_norm": 0.48052129, "learning_rate": 5e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.217821, "epoch": 0.12626263, "global_step/max_steps": "25/990", "percentage": "2.53%", "elapsed_time": "1m 54s", "remaining_time": "1h 13m 35s"} +{"loss": 0.57625828, "token_acc": 0.81287895, "grad_norm": 1.09942508, "learning_rate": 6e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.225877, "epoch": 0.15151515, "global_step/max_steps": "30/990", "percentage": "3.03%", "elapsed_time": "2m 12s", "remaining_time": "1h 10m 37s"} +{"loss": 0.37086082, "token_acc": 0.85262206, "grad_norm": 0.30022499, "learning_rate": 7e-05, "memory(GiB)": 178.91, 
"train_speed(iter/s)": 0.229602, "epoch": 0.17676768, "global_step/max_steps": "35/990", "percentage": "3.54%", "elapsed_time": "2m 32s", "remaining_time": "1h 9m 9s"} +{"loss": 0.4419909, "token_acc": 0.84672683, "grad_norm": 0.31309283, "learning_rate": 8e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.229371, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "2m 54s", "remaining_time": "1h 8m 52s"} +{"eval_loss": 0.52626723, "eval_token_acc": 0.73526474, "eval_runtime": 1.7976, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 0.2020202, "global_step/max_steps": "40/990", "percentage": "4.04%", "elapsed_time": "2m 55s", "remaining_time": "1h 9m 35s"} +{"loss": 0.44392233, "token_acc": 0.83723785, "grad_norm": 0.45766538, "learning_rate": 9e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.220221, "epoch": 0.22727273, "global_step/max_steps": "45/990", "percentage": "4.55%", "elapsed_time": "3m 23s", "remaining_time": "1h 11m 23s"} +{"loss": 0.52411561, "token_acc": 0.86690766, "grad_norm": 0.36840695, "learning_rate": 0.0001, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.215242, "epoch": 0.25252525, "global_step/max_steps": "50/990", "percentage": "5.05%", "elapsed_time": "3m 51s", "remaining_time": "1h 12m 40s"} +{"loss": 0.46913376, "token_acc": 0.84754415, "grad_norm": 0.41670883, "learning_rate": 9.999e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.215104, "epoch": 0.27777778, "global_step/max_steps": "55/990", "percentage": "5.56%", "elapsed_time": "4m 15s", "remaining_time": "1h 12m 20s"} +{"loss": 0.3627877, "token_acc": 0.85572414, "grad_norm": 0.46224689, "learning_rate": 9.997e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.220608, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "4m 31s", "remaining_time": "1h 10m 9s"} +{"eval_loss": 0.49902683, "eval_token_acc": 0.74025974, "eval_runtime": 1.7923, 
"eval_samples_per_second": 2.232, "eval_steps_per_second": 2.232, "epoch": 0.3030303, "global_step/max_steps": "60/990", "percentage": "6.06%", "elapsed_time": "4m 33s", "remaining_time": "1h 10m 37s"} +{"loss": 0.54401402, "token_acc": 0.82857143, "grad_norm": 0.65567118, "learning_rate": 9.994e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.215587, "epoch": 0.32828283, "global_step/max_steps": "65/990", "percentage": "6.57%", "elapsed_time": "5m 1s", "remaining_time": "1h 11m 25s"} +{"loss": 0.40032358, "token_acc": 0.87363495, "grad_norm": 0.27956635, "learning_rate": 9.989e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.213465, "epoch": 0.35353535, "global_step/max_steps": "70/990", "percentage": "7.07%", "elapsed_time": "5m 27s", "remaining_time": "1h 11m 44s"} +{"loss": 0.57152624, "token_acc": 0.8336414, "grad_norm": 0.4587284, "learning_rate": 9.983e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.217719, "epoch": 0.37878788, "global_step/max_steps": "75/990", "percentage": "7.58%", "elapsed_time": "5m 44s", "remaining_time": "1h 9m 58s"} +{"loss": 0.60025573, "token_acc": 0.83684761, "grad_norm": 0.39165273, "learning_rate": 9.975e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.218182, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "6m 6s", "remaining_time": "1h 9m 26s"} +{"eval_loss": 0.5162496, "eval_token_acc": 0.73926074, "eval_runtime": 1.8197, "eval_samples_per_second": 2.198, "eval_steps_per_second": 2.198, "epoch": 0.4040404, "global_step/max_steps": "80/990", "percentage": "8.08%", "elapsed_time": "6m 8s", "remaining_time": "1h 9m 47s"} +{"loss": 0.54492645, "token_acc": 0.81428571, "grad_norm": 0.27539632, "learning_rate": 9.966e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.214816, "epoch": 0.42929293, "global_step/max_steps": "85/990", "percentage": "8.59%", "elapsed_time": "6m 35s", "remaining_time": "1h 10m 8s"} +{"loss": 0.43953519, "token_acc": 0.84813654, "grad_norm": 
0.63126928, "learning_rate": 9.955e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.216199, "epoch": 0.45454545, "global_step/max_steps": "90/990", "percentage": "9.09%", "elapsed_time": "6m 55s", "remaining_time": "1h 9m 19s"} +{"loss": 0.35303638, "token_acc": 0.86679124, "grad_norm": 0.33478057, "learning_rate": 9.944e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.215501, "epoch": 0.47979798, "global_step/max_steps": "95/990", "percentage": "9.60%", "elapsed_time": "7m 20s", "remaining_time": "1h 9m 9s"} +{"loss": 0.41626186, "token_acc": 0.86551491, "grad_norm": 0.2933507, "learning_rate": 9.93e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212789, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "7m 49s", "remaining_time": "1h 9m 39s"} +{"eval_loss": 0.51816493, "eval_token_acc": 0.74025974, "eval_runtime": 1.7982, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 0.50505051, "global_step/max_steps": "100/990", "percentage": "10.10%", "elapsed_time": "7m 51s", "remaining_time": "1h 9m 55s"} +{"loss": 0.54417596, "token_acc": 0.80285412, "grad_norm": 0.48549584, "learning_rate": 9.916e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212013, "epoch": 0.53030303, "global_step/max_steps": "105/990", "percentage": "10.61%", "elapsed_time": "8m 14s", "remaining_time": "1h 9m 31s"} +{"loss": 0.5125464, "token_acc": 0.80686695, "grad_norm": 1.27370763, "learning_rate": 9.9e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.216603, "epoch": 0.55555556, "global_step/max_steps": "110/990", "percentage": "11.11%", "elapsed_time": "8m 27s", "remaining_time": "1h 7m 39s"} +{"loss": 0.45648856, "token_acc": 0.86146286, "grad_norm": 0.24062683, "learning_rate": 9.882e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.213323, "epoch": 0.58080808, "global_step/max_steps": "115/990", "percentage": "11.62%", "elapsed_time": "8m 58s", "remaining_time": "1h 8m 18s"} +{"loss": 
0.51087871, "token_acc": 0.8145441, "grad_norm": 0.76560038, "learning_rate": 9.864e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212345, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "9m 24s", "remaining_time": "1h 8m 14s"} +{"eval_loss": 0.49168646, "eval_token_acc": 0.74025974, "eval_runtime": 1.7863, "eval_samples_per_second": 2.239, "eval_steps_per_second": 2.239, "epoch": 0.60606061, "global_step/max_steps": "120/990", "percentage": "12.12%", "elapsed_time": "9m 26s", "remaining_time": "1h 8m 27s"} +{"loss": 0.46934419, "token_acc": 0.83522818, "grad_norm": 0.36274302, "learning_rate": 9.844e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212255, "epoch": 0.63131313, "global_step/max_steps": "125/990", "percentage": "12.63%", "elapsed_time": "9m 48s", "remaining_time": "1h 7m 52s"} +{"loss": 0.47632036, "token_acc": 0.84509056, "grad_norm": 0.39431232, "learning_rate": 9.822e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211377, "epoch": 0.65656566, "global_step/max_steps": "130/990", "percentage": "13.13%", "elapsed_time": "10m 14s", "remaining_time": "1h 7m 46s"} +{"loss": 0.36736994, "token_acc": 0.87311347, "grad_norm": 0.22374582, "learning_rate": 9.8e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211464, "epoch": 0.68181818, "global_step/max_steps": "135/990", "percentage": "13.64%", "elapsed_time": "10m 38s", "remaining_time": "1h 7m 20s"} +{"loss": 0.52923031, "token_acc": 0.82730349, "grad_norm": 0.3889102, "learning_rate": 9.776e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211083, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "11m 2s", "remaining_time": "1h 7m 4s"} +{"eval_loss": 0.43900543, "eval_token_acc": 0.74225774, "eval_runtime": 1.8334, "eval_samples_per_second": 2.182, "eval_steps_per_second": 2.182, "epoch": 0.70707071, "global_step/max_steps": "140/990", "percentage": "14.14%", "elapsed_time": "11m 4s", 
"remaining_time": "1h 7m 15s"} +{"loss": 0.34286373, "token_acc": 0.85965463, "grad_norm": 0.57052732, "learning_rate": 9.75e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.208774, "epoch": 0.73232323, "global_step/max_steps": "145/990", "percentage": "14.65%", "elapsed_time": "11m 34s", "remaining_time": "1h 7m 25s"} +{"loss": 0.40458231, "token_acc": 0.87242936, "grad_norm": 0.92507231, "learning_rate": 9.723e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.20966, "epoch": 0.75757576, "global_step/max_steps": "150/990", "percentage": "15.15%", "elapsed_time": "11m 55s", "remaining_time": "1h 6m 44s"} +{"loss": 0.42027054, "token_acc": 0.85642884, "grad_norm": 0.47075906, "learning_rate": 9.695e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209281, "epoch": 0.78282828, "global_step/max_steps": "155/990", "percentage": "15.66%", "elapsed_time": "12m 20s", "remaining_time": "1h 6m 27s"} +{"loss": 0.34715896, "token_acc": 0.88205693, "grad_norm": 0.42332849, "learning_rate": 9.666e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210051, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "12m 41s", "remaining_time": "1h 5m 49s"} +{"eval_loss": 0.42763793, "eval_token_acc": 0.74725275, "eval_runtime": 1.8048, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 0.80808081, "global_step/max_steps": "160/990", "percentage": "16.16%", "elapsed_time": "12m 43s", "remaining_time": "1h 5m 58s"} +{"loss": 0.41473951, "token_acc": 0.84523486, "grad_norm": 0.2773014, "learning_rate": 9.635e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209147, "epoch": 0.83333333, "global_step/max_steps": "165/990", "percentage": "16.67%", "elapsed_time": "13m 8s", "remaining_time": "1h 5m 42s"} +{"loss": 0.45830173, "token_acc": 0.85353602, "grad_norm": 0.40669981, "learning_rate": 9.603e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208696, "epoch": 0.85858586, "global_step/max_steps": 
"170/990", "percentage": "17.17%", "elapsed_time": "13m 34s", "remaining_time": "1h 5m 27s"} +{"loss": 0.41636529, "token_acc": 0.85295121, "grad_norm": 0.49174753, "learning_rate": 9.57e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209604, "epoch": 0.88383838, "global_step/max_steps": "175/990", "percentage": "17.68%", "elapsed_time": "13m 54s", "remaining_time": "1h 4m 46s"} +{"loss": 0.42295904, "token_acc": 0.86157518, "grad_norm": 0.34121031, "learning_rate": 9.535e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.21007, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "14m 16s", "remaining_time": "1h 4m 14s"} +{"eval_loss": 0.43593785, "eval_token_acc": 0.75524476, "eval_runtime": 1.7765, "eval_samples_per_second": 2.252, "eval_steps_per_second": 2.252, "epoch": 0.90909091, "global_step/max_steps": "180/990", "percentage": "18.18%", "elapsed_time": "14m 18s", "remaining_time": "1h 4m 22s"} +{"loss": 0.4941268, "token_acc": 0.82007051, "grad_norm": 0.48130041, "learning_rate": 9.5e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20925, "epoch": 0.93434343, "global_step/max_steps": "185/990", "percentage": "18.69%", "elapsed_time": "14m 43s", "remaining_time": "1h 4m 5s"} +{"loss": 0.7393961, "token_acc": 0.79909834, "grad_norm": 2.57511306, "learning_rate": 9.463e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210972, "epoch": 0.95959596, "global_step/max_steps": "190/990", "percentage": "19.19%", "elapsed_time": "15m 0s", "remaining_time": "1h 3m 10s"} +{"loss": 0.5489778, "token_acc": 0.81906874, "grad_norm": 0.27773517, "learning_rate": 9.424e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210161, "epoch": 0.98484848, "global_step/max_steps": "195/990", "percentage": "19.70%", "elapsed_time": "15m 27s", "remaining_time": "1h 3m 1s"} +{"loss": 0.47465434, "token_acc": 0.85632139, "grad_norm": 0.299546, "learning_rate": 9.385e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207474, 
"epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "16m 3s", "remaining_time": "1h 3m 26s"} +{"eval_loss": 0.47131014, "eval_token_acc": 0.75024975, "eval_runtime": 1.7784, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 1.01010101, "global_step/max_steps": "200/990", "percentage": "20.20%", "elapsed_time": "16m 5s", "remaining_time": "1h 3m 33s"} +{"loss": 0.42580986, "token_acc": 0.83790817, "grad_norm": 0.29189751, "learning_rate": 9.344e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204959, "epoch": 1.03535354, "global_step/max_steps": "205/990", "percentage": "20.71%", "elapsed_time": "16m 39s", "remaining_time": "1h 3m 48s"} +{"loss": 0.35763757, "token_acc": 0.87683032, "grad_norm": 0.47305414, "learning_rate": 9.302e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204647, "epoch": 1.06060606, "global_step/max_steps": "210/990", "percentage": "21.21%", "elapsed_time": "17m 5s", "remaining_time": "1h 3m 30s"} +{"loss": 0.33073678, "token_acc": 0.8893819, "grad_norm": 0.39628196, "learning_rate": 9.259e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203729, "epoch": 1.08585859, "global_step/max_steps": "215/990", "percentage": "21.72%", "elapsed_time": "17m 34s", "remaining_time": "1h 3m 22s"} +{"loss": 0.29806085, "token_acc": 0.90179606, "grad_norm": 0.55148607, "learning_rate": 9.214e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204164, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "17m 57s", "remaining_time": "1h 2m 50s"} +{"eval_loss": 0.46758109, "eval_token_acc": 0.74925075, "eval_runtime": 1.7981, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 1.11111111, "global_step/max_steps": "220/990", "percentage": "22.22%", "elapsed_time": "17m 58s", "remaining_time": "1h 2m 56s"} +{"loss": 0.42431865, "token_acc": 0.84602936, "grad_norm": 0.53113592, "learning_rate": 9.169e-05, 
"memory(GiB)": 194.42, "train_speed(iter/s)": 0.203358, "epoch": 1.13636364, "global_step/max_steps": "225/990", "percentage": "22.73%", "elapsed_time": "18m 26s", "remaining_time": "1h 2m 40s"} +{"loss": 0.36201489, "token_acc": 0.8752454, "grad_norm": 0.39372367, "learning_rate": 9.122e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.202013, "epoch": 1.16161616, "global_step/max_steps": "230/990", "percentage": "23.23%", "elapsed_time": "18m 58s", "remaining_time": "1h 2m 40s"} +{"loss": 0.15755239, "token_acc": 0.92670425, "grad_norm": 0.48343992, "learning_rate": 9.074e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203514, "epoch": 1.18686869, "global_step/max_steps": "235/990", "percentage": "23.74%", "elapsed_time": "19m 14s", "remaining_time": "1h 1m 48s"} +{"loss": 0.38658602, "token_acc": 0.87145434, "grad_norm": 0.74133027, "learning_rate": 9.025e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204106, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "19m 35s", "remaining_time": "1h 1m 13s"} +{"eval_loss": 0.51413596, "eval_token_acc": 0.74325674, "eval_runtime": 1.7909, "eval_samples_per_second": 2.233, "eval_steps_per_second": 2.233, "epoch": 1.21212121, "global_step/max_steps": "240/990", "percentage": "24.24%", "elapsed_time": "19m 37s", "remaining_time": "1h 1m 18s"} +{"loss": 0.24457972, "token_acc": 0.86328319, "grad_norm": 0.497913, "learning_rate": 8.975e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204329, "epoch": 1.23737374, "global_step/max_steps": "245/990", "percentage": "24.75%", "elapsed_time": "19m 58s", "remaining_time": "1h 0m 44s"} +{"loss": 0.38393686, "token_acc": 0.86459489, "grad_norm": 0.77854681, "learning_rate": 8.924e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205736, "epoch": 1.26262626, "global_step/max_steps": "250/990", "percentage": "25.25%", "elapsed_time": "20m 14s", "remaining_time": "59m 55s"} +{"loss": 0.3447341, "token_acc": 0.86901225, 
"grad_norm": 0.42964199, "learning_rate": 8.872e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204547, "epoch": 1.28787879, "global_step/max_steps": "255/990", "percentage": "25.76%", "elapsed_time": "20m 46s", "remaining_time": "59m 52s"} +{"loss": 0.32251389, "token_acc": 0.87506489, "grad_norm": 0.69879895, "learning_rate": 8.818e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.2052, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "21m 6s", "remaining_time": "59m 16s"} +{"eval_loss": 0.45786056, "eval_token_acc": 0.76023976, "eval_runtime": 1.7868, "eval_samples_per_second": 2.239, "eval_steps_per_second": 2.239, "epoch": 1.31313131, "global_step/max_steps": "260/990", "percentage": "26.26%", "elapsed_time": "21m 8s", "remaining_time": "59m 21s"} +{"loss": 0.25296369, "token_acc": 0.89998286, "grad_norm": 0.22191784, "learning_rate": 8.764e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203055, "epoch": 1.33838384, "global_step/max_steps": "265/990", "percentage": "26.77%", "elapsed_time": "21m 44s", "remaining_time": "59m 29s"} +{"loss": 0.23594444, "token_acc": 0.90832762, "grad_norm": 0.43771964, "learning_rate": 8.708e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20368, "epoch": 1.36363636, "global_step/max_steps": "270/990", "percentage": "27.27%", "elapsed_time": "22m 5s", "remaining_time": "58m 53s"} +{"loss": 0.25529866, "token_acc": 0.90525176, "grad_norm": 0.38278905, "learning_rate": 8.652e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203283, "epoch": 1.38888889, "global_step/max_steps": "275/990", "percentage": "27.78%", "elapsed_time": "22m 32s", "remaining_time": "58m 36s"} +{"loss": 0.3728013, "token_acc": 0.88516493, "grad_norm": 0.83121717, "learning_rate": 8.594e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203978, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "22m 52s", "remaining_time": "57m 59s"} 
+{"eval_loss": 0.42085442, "eval_token_acc": 0.75924076, "eval_runtime": 1.7793, "eval_samples_per_second": 2.248, "eval_steps_per_second": 2.248, "epoch": 1.41414141, "global_step/max_steps": "280/990", "percentage": "28.28%", "elapsed_time": "22m 54s", "remaining_time": "58m 4s"} +{"loss": 0.1747979, "token_acc": 0.90636379, "grad_norm": 0.74464381, "learning_rate": 8.536e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20415, "epoch": 1.43939394, "global_step/max_steps": "285/990", "percentage": "28.79%", "elapsed_time": "23m 15s", "remaining_time": "57m 32s"} +{"loss": 0.33746436, "token_acc": 0.86484785, "grad_norm": 0.79865491, "learning_rate": 8.476e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205059, "epoch": 1.46464646, "global_step/max_steps": "290/990", "percentage": "29.29%", "elapsed_time": "23m 33s", "remaining_time": "56m 52s"} +{"loss": 0.35075483, "token_acc": 0.84849193, "grad_norm": 0.45301414, "learning_rate": 8.415e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206092, "epoch": 1.48989899, "global_step/max_steps": "295/990", "percentage": "29.80%", "elapsed_time": "23m 51s", "remaining_time": "56m 11s"} +{"loss": 0.27676065, "token_acc": 0.88765706, "grad_norm": 1.57653129, "learning_rate": 8.354e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206251, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "24m 14s", "remaining_time": "55m 44s"} +{"eval_loss": 0.41149631, "eval_token_acc": 0.75424575, "eval_runtime": 1.7835, "eval_samples_per_second": 2.243, "eval_steps_per_second": 2.243, "epoch": 1.51515152, "global_step/max_steps": "300/990", "percentage": "30.30%", "elapsed_time": "24m 15s", "remaining_time": "55m 48s"} +{"loss": 0.33141241, "token_acc": 0.87466428, "grad_norm": 0.54671615, "learning_rate": 8.291e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205011, "epoch": 1.54040404, "global_step/max_steps": "305/990", "percentage": "30.81%", "elapsed_time": "24m 
47s", "remaining_time": "55m 40s"} +{"loss": 0.26014681, "token_acc": 0.91019037, "grad_norm": 0.51829976, "learning_rate": 8.228e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205931, "epoch": 1.56565657, "global_step/max_steps": "310/990", "percentage": "31.31%", "elapsed_time": "25m 4s", "remaining_time": "55m 1s"} +{"loss": 0.22466779, "token_acc": 0.90713166, "grad_norm": 1.43135619, "learning_rate": 8.164e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206665, "epoch": 1.59090909, "global_step/max_steps": "315/990", "percentage": "31.82%", "elapsed_time": "25m 23s", "remaining_time": "54m 25s"} +{"loss": 0.29714296, "token_acc": 0.90940118, "grad_norm": 0.48487395, "learning_rate": 8.099e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206913, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "25m 46s", "remaining_time": "53m 57s"} +{"eval_loss": 0.34123558, "eval_token_acc": 0.75624376, "eval_runtime": 1.7732, "eval_samples_per_second": 2.256, "eval_steps_per_second": 2.256, "epoch": 1.61616162, "global_step/max_steps": "320/990", "percentage": "32.32%", "elapsed_time": "25m 47s", "remaining_time": "54m 0s"} +{"loss": 0.30359759, "token_acc": 0.86476628, "grad_norm": 0.52747029, "learning_rate": 8.033e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206592, "epoch": 1.64141414, "global_step/max_steps": "325/990", "percentage": "32.83%", "elapsed_time": "26m 12s", "remaining_time": "53m 38s"} +{"loss": 0.36988287, "token_acc": 0.8955608, "grad_norm": 0.71160477, "learning_rate": 7.966e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207439, "epoch": 1.66666667, "global_step/max_steps": "330/990", "percentage": "33.33%", "elapsed_time": "26m 30s", "remaining_time": "53m 0s"} +{"loss": 0.2918565, "token_acc": 0.89257482, "grad_norm": 0.4688921, "learning_rate": 7.898e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207421, "epoch": 1.69191919, "global_step/max_steps": "335/990", 
"percentage": "33.84%", "elapsed_time": "26m 54s", "remaining_time": "52m 37s"} +{"loss": 0.20344338, "token_acc": 0.92035199, "grad_norm": 0.97644758, "learning_rate": 7.83e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208201, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "27m 12s", "remaining_time": "52m 1s"} +{"eval_loss": 0.34089023, "eval_token_acc": 0.75624376, "eval_runtime": 1.8007, "eval_samples_per_second": 2.221, "eval_steps_per_second": 2.221, "epoch": 1.71717172, "global_step/max_steps": "340/990", "percentage": "34.34%", "elapsed_time": "27m 14s", "remaining_time": "52m 4s"} +{"loss": 0.24397383, "token_acc": 0.90525745, "grad_norm": 0.52789664, "learning_rate": 7.76e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207523, "epoch": 1.74242424, "global_step/max_steps": "345/990", "percentage": "34.85%", "elapsed_time": "27m 42s", "remaining_time": "51m 47s"} +{"loss": 0.18279754, "token_acc": 0.93266769, "grad_norm": 1.44505608, "learning_rate": 7.69e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208339, "epoch": 1.76767677, "global_step/max_steps": "350/990", "percentage": "35.35%", "elapsed_time": "27m 59s", "remaining_time": "51m 11s"} +{"loss": 0.36357427, "token_acc": 0.88109798, "grad_norm": 0.90072274, "learning_rate": 7.62e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208407, "epoch": 1.79292929, "global_step/max_steps": "355/990", "percentage": "35.86%", "elapsed_time": "28m 23s", "remaining_time": "50m 46s"} +{"loss": 0.22396004, "token_acc": 0.90859209, "grad_norm": 0.33272004, "learning_rate": 7.548e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208397, "epoch": 1.81818182, "global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "28m 47s", "remaining_time": "50m 22s"} +{"eval_loss": 0.37214178, "eval_token_acc": 0.74825175, "eval_runtime": 1.8042, "eval_samples_per_second": 2.217, "eval_steps_per_second": 2.217, "epoch": 1.81818182, 
"global_step/max_steps": "360/990", "percentage": "36.36%", "elapsed_time": "28m 48s", "remaining_time": "50m 25s"} +{"loss": 0.29300323, "token_acc": 0.8698196, "grad_norm": 0.8981424, "learning_rate": 7.476e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208386, "epoch": 1.84343434, "global_step/max_steps": "365/990", "percentage": "36.87%", "elapsed_time": "29m 11s", "remaining_time": "49m 58s"} +{"loss": 0.30235629, "token_acc": 0.88782664, "grad_norm": 0.37038329, "learning_rate": 7.403e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208339, "epoch": 1.86868687, "global_step/max_steps": "370/990", "percentage": "37.37%", "elapsed_time": "29m 35s", "remaining_time": "49m 35s"} +{"loss": 0.35419927, "token_acc": 0.88196567, "grad_norm": 0.50516832, "learning_rate": 7.329e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208034, "epoch": 1.89393939, "global_step/max_steps": "375/990", "percentage": "37.88%", "elapsed_time": "30m 2s", "remaining_time": "49m 15s"} +{"loss": 0.31901164, "token_acc": 0.89629321, "grad_norm": 0.71648002, "learning_rate": 7.255e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208147, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "30m 25s", "remaining_time": "48m 50s"} +{"eval_loss": 0.31777072, "eval_token_acc": 0.76323676, "eval_runtime": 1.8015, "eval_samples_per_second": 2.22, "eval_steps_per_second": 2.22, "epoch": 1.91919192, "global_step/max_steps": "380/990", "percentage": "38.38%", "elapsed_time": "30m 27s", "remaining_time": "48m 52s"} +{"loss": 0.33027997, "token_acc": 0.88464506, "grad_norm": 0.24409626, "learning_rate": 7.18e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207591, "epoch": 1.94444444, "global_step/max_steps": "385/990", "percentage": "38.89%", "elapsed_time": "30m 54s", "remaining_time": "48m 33s"} +{"loss": 0.23348629, "token_acc": 0.90389989, "grad_norm": 0.41487977, "learning_rate": 7.105e-05, "memory(GiB)": 194.42, 
"train_speed(iter/s)": 0.207441, "epoch": 1.96969697, "global_step/max_steps": "390/990", "percentage": "39.39%", "elapsed_time": "31m 19s", "remaining_time": "48m 11s"} +{"loss": 0.19255179, "token_acc": 0.91864859, "grad_norm": 0.39935234, "learning_rate": 7.029e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208111, "epoch": 1.99494949, "global_step/max_steps": "395/990", "percentage": "39.90%", "elapsed_time": "31m 37s", "remaining_time": "47m 38s"} +{"loss": 0.17468176, "token_acc": 0.95617402, "grad_norm": 0.38700497, "learning_rate": 6.952e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209087, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "31m 52s", "remaining_time": "47m 1s"} +{"eval_loss": 0.2884281, "eval_token_acc": 0.76123876, "eval_runtime": 1.7921, "eval_samples_per_second": 2.232, "eval_steps_per_second": 2.232, "epoch": 2.02020202, "global_step/max_steps": "400/990", "percentage": "40.40%", "elapsed_time": "31m 54s", "remaining_time": "47m 3s"} +{"loss": 0.12435632, "token_acc": 0.92033945, "grad_norm": 0.85902083, "learning_rate": 6.875e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208801, "epoch": 2.04545455, "global_step/max_steps": "405/990", "percentage": "40.91%", "elapsed_time": "32m 19s", "remaining_time": "46m 41s"} +{"loss": 0.12814103, "token_acc": 0.93287534, "grad_norm": 1.18803513, "learning_rate": 6.797e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208916, "epoch": 2.07070707, "global_step/max_steps": "410/990", "percentage": "41.41%", "elapsed_time": "32m 42s", "remaining_time": "46m 15s"} +{"loss": 0.13511013, "token_acc": 0.94182459, "grad_norm": 0.56210369, "learning_rate": 6.719e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20947, "epoch": 2.0959596, "global_step/max_steps": "415/990", "percentage": "41.92%", "elapsed_time": "33m 0s", "remaining_time": "45m 44s"} +{"loss": 0.08711874, "token_acc": 0.98264171, "grad_norm": 0.33909729, 
"learning_rate": 6.64e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.21021, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "33m 17s", "remaining_time": "45m 11s"} +{"eval_loss": 0.2877776, "eval_token_acc": 0.76123876, "eval_runtime": 1.7912, "eval_samples_per_second": 2.233, "eval_steps_per_second": 2.233, "epoch": 2.12121212, "global_step/max_steps": "420/990", "percentage": "42.42%", "elapsed_time": "33m 19s", "remaining_time": "45m 13s"} +{"loss": 0.14229174, "token_acc": 0.9111276, "grad_norm": 0.6231221, "learning_rate": 6.561e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210123, "epoch": 2.14646465, "global_step/max_steps": "425/990", "percentage": "42.93%", "elapsed_time": "33m 42s", "remaining_time": "44m 48s"} +{"loss": 0.09396475, "token_acc": 0.97177869, "grad_norm": 0.29918316, "learning_rate": 6.481e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209878, "epoch": 2.17171717, "global_step/max_steps": "430/990", "percentage": "43.43%", "elapsed_time": "34m 8s", "remaining_time": "44m 27s"} +{"loss": 0.11964769, "token_acc": 0.96683957, "grad_norm": 0.2828837, "learning_rate": 6.401e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209161, "epoch": 2.1969697, "global_step/max_steps": "435/990", "percentage": "43.94%", "elapsed_time": "34m 39s", "remaining_time": "44m 12s"} +{"loss": 0.13254006, "token_acc": 0.95438828, "grad_norm": 0.81782269, "learning_rate": 6.321e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209483, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "35m 0s", "remaining_time": "43m 45s"} +{"eval_loss": 0.30088347, "eval_token_acc": 0.75624376, "eval_runtime": 1.7791, "eval_samples_per_second": 2.248, "eval_steps_per_second": 2.248, "epoch": 2.22222222, "global_step/max_steps": "440/990", "percentage": "44.44%", "elapsed_time": "35m 1s", "remaining_time": "43m 47s"} +{"loss": 0.05500662, "token_acc": 0.95050778, 
"grad_norm": 0.43101543, "learning_rate": 6.24e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209114, "epoch": 2.24747475, "global_step/max_steps": "445/990", "percentage": "44.95%", "elapsed_time": "35m 27s", "remaining_time": "43m 25s"} +{"loss": 0.19470509, "token_acc": 0.93248312, "grad_norm": 0.88504606, "learning_rate": 6.159e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209501, "epoch": 2.27272727, "global_step/max_steps": "450/990", "percentage": "45.45%", "elapsed_time": "35m 47s", "remaining_time": "42m 57s"} +{"loss": 0.09629551, "token_acc": 0.96713935, "grad_norm": 0.40710202, "learning_rate": 6.078e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209586, "epoch": 2.2979798, "global_step/max_steps": "455/990", "percentage": "45.96%", "elapsed_time": "36m 10s", "remaining_time": "42m 32s"} +{"loss": 0.11653724, "token_acc": 0.96252591, "grad_norm": 0.41429275, "learning_rate": 5.996e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209806, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "36m 32s", "remaining_time": "42m 5s"} +{"eval_loss": 0.3004967, "eval_token_acc": 0.75324675, "eval_runtime": 1.7662, "eval_samples_per_second": 2.265, "eval_steps_per_second": 2.265, "epoch": 2.32323232, "global_step/max_steps": "460/990", "percentage": "46.46%", "elapsed_time": "36m 33s", "remaining_time": "42m 7s"} +{"loss": 0.12927284, "token_acc": 0.90990991, "grad_norm": 0.53362256, "learning_rate": 5.914e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210062, "epoch": 2.34848485, "global_step/max_steps": "465/990", "percentage": "46.97%", "elapsed_time": "36m 53s", "remaining_time": "41m 38s"} +{"loss": 0.08137755, "token_acc": 0.96566231, "grad_norm": 0.69000143, "learning_rate": 5.832e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209705, "epoch": 2.37373737, "global_step/max_steps": "470/990", "percentage": "47.47%", "elapsed_time": "37m 20s", "remaining_time": "41m 19s"} +{"loss": 
0.10649137, "token_acc": 0.95983327, "grad_norm": 0.87968314, "learning_rate": 5.749e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210106, "epoch": 2.3989899, "global_step/max_steps": "475/990", "percentage": "47.98%", "elapsed_time": "37m 40s", "remaining_time": "40m 50s"} +{"loss": 0.17189999, "token_acc": 0.94043306, "grad_norm": 0.3351011, "learning_rate": 5.666e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209154, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "38m 14s", "remaining_time": "40m 37s"} +{"eval_loss": 0.30119926, "eval_token_acc": 0.75824176, "eval_runtime": 1.7649, "eval_samples_per_second": 2.266, "eval_steps_per_second": 2.266, "epoch": 2.42424242, "global_step/max_steps": "480/990", "percentage": "48.48%", "elapsed_time": "38m 16s", "remaining_time": "40m 39s"} +{"loss": 0.09826596, "token_acc": 0.91701928, "grad_norm": 0.72007924, "learning_rate": 5.584e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209559, "epoch": 2.44949495, "global_step/max_steps": "485/990", "percentage": "48.99%", "elapsed_time": "38m 33s", "remaining_time": "40m 9s"} +{"loss": 0.19064462, "token_acc": 0.92577598, "grad_norm": 0.58847201, "learning_rate": 5.5e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209499, "epoch": 2.47474747, "global_step/max_steps": "490/990", "percentage": "49.49%", "elapsed_time": "38m 58s", "remaining_time": "39m 46s"} +{"loss": 0.10975324, "token_acc": 0.95879523, "grad_norm": 0.33910003, "learning_rate": 5.417e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209984, "epoch": 2.5, "global_step/max_steps": "495/990", "percentage": "50.00%", "elapsed_time": "39m 16s", "remaining_time": "39m 16s"} +{"loss": 0.17116069, "token_acc": 0.91639009, "grad_norm": 0.53634661, "learning_rate": 5.334e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209844, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "39m 42s", 
"remaining_time": "38m 54s"} +{"eval_loss": 0.29063457, "eval_token_acc": 0.75224775, "eval_runtime": 1.8092, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 2.52525253, "global_step/max_steps": "500/990", "percentage": "50.51%", "elapsed_time": "39m 44s", "remaining_time": "38m 56s"} +{"loss": 0.0904828, "token_acc": 0.92896094, "grad_norm": 0.60323656, "learning_rate": 5.251e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209717, "epoch": 2.55050505, "global_step/max_steps": "505/990", "percentage": "51.01%", "elapsed_time": "40m 7s", "remaining_time": "38m 32s"} +{"loss": 0.09625996, "token_acc": 0.96649485, "grad_norm": 0.62781221, "learning_rate": 5.167e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210284, "epoch": 2.57575758, "global_step/max_steps": "510/990", "percentage": "51.52%", "elapsed_time": "40m 24s", "remaining_time": "38m 2s"} +{"loss": 0.13246565, "token_acc": 0.95012146, "grad_norm": 0.38628754, "learning_rate": 5.084e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210549, "epoch": 2.6010101, "global_step/max_steps": "515/990", "percentage": "52.02%", "elapsed_time": "40m 45s", "remaining_time": "37m 35s"} +{"loss": 0.13336525, "token_acc": 0.94273366, "grad_norm": 0.46550637, "learning_rate": 5e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210504, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "41m 9s", "remaining_time": "37m 12s"} +{"eval_loss": 0.29845107, "eval_token_acc": 0.75024975, "eval_runtime": 1.8075, "eval_samples_per_second": 2.213, "eval_steps_per_second": 2.213, "epoch": 2.62626263, "global_step/max_steps": "520/990", "percentage": "52.53%", "elapsed_time": "41m 11s", "remaining_time": "37m 14s"} +{"loss": 0.09229624, "token_acc": 0.9316996, "grad_norm": 0.45421982, "learning_rate": 4.916e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210556, "epoch": 2.65151515, "global_step/max_steps": "525/990", "percentage": "53.03%", 
"elapsed_time": "41m 33s", "remaining_time": "36m 48s"} +{"loss": 0.16157912, "token_acc": 0.94532267, "grad_norm": 0.38779676, "learning_rate": 4.833e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209926, "epoch": 2.67676768, "global_step/max_steps": "530/990", "percentage": "53.54%", "elapsed_time": "42m 4s", "remaining_time": "36m 30s"} +{"loss": 0.09830495, "token_acc": 0.96677373, "grad_norm": 0.75503731, "learning_rate": 4.749e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209454, "epoch": 2.7020202, "global_step/max_steps": "535/990", "percentage": "54.04%", "elapsed_time": "42m 33s", "remaining_time": "36m 11s"} +{"loss": 0.21627908, "token_acc": 0.918975, "grad_norm": 0.8958115, "learning_rate": 4.666e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20902, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "43m 3s", "remaining_time": "35m 52s"} +{"eval_loss": 0.29240885, "eval_token_acc": 0.75824176, "eval_runtime": 1.8087, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 2.72727273, "global_step/max_steps": "540/990", "percentage": "54.55%", "elapsed_time": "43m 4s", "remaining_time": "35m 54s"} +{"loss": 0.15035768, "token_acc": 0.91052137, "grad_norm": 0.62307805, "learning_rate": 4.583e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208676, "epoch": 2.75252525, "global_step/max_steps": "545/990", "percentage": "55.05%", "elapsed_time": "43m 31s", "remaining_time": "35m 32s"} +{"loss": 0.1066666, "token_acc": 0.96255371, "grad_norm": 0.27316108, "learning_rate": 4.5e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20872, "epoch": 2.77777778, "global_step/max_steps": "550/990", "percentage": "55.56%", "elapsed_time": "43m 54s", "remaining_time": "35m 7s"} +{"loss": 0.09249015, "token_acc": 0.96887581, "grad_norm": 0.50839013, "learning_rate": 4.416e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208968, "epoch": 2.8030303, "global_step/max_steps": 
"555/990", "percentage": "56.06%", "elapsed_time": "44m 15s", "remaining_time": "34m 41s"} +{"loss": 0.1238274, "token_acc": 0.94271406, "grad_norm": 0.3543936, "learning_rate": 4.334e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208825, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "44m 41s", "remaining_time": "34m 18s"} +{"eval_loss": 0.28748947, "eval_token_acc": 0.75424575, "eval_runtime": 1.8169, "eval_samples_per_second": 2.202, "eval_steps_per_second": 2.202, "epoch": 2.82828283, "global_step/max_steps": "560/990", "percentage": "56.57%", "elapsed_time": "44m 43s", "remaining_time": "34m 20s"} +{"loss": 0.14281988, "token_acc": 0.92157282, "grad_norm": 0.68178886, "learning_rate": 4.251e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208389, "epoch": 2.85353535, "global_step/max_steps": "565/990", "percentage": "57.07%", "elapsed_time": "45m 10s", "remaining_time": "33m 59s"} +{"loss": 0.19629953, "token_acc": 0.92469512, "grad_norm": 0.64119381, "learning_rate": 4.168e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208033, "epoch": 2.87878788, "global_step/max_steps": "570/990", "percentage": "57.58%", "elapsed_time": "45m 39s", "remaining_time": "33m 38s"} +{"loss": 0.10608698, "token_acc": 0.95675676, "grad_norm": 0.7665056, "learning_rate": 4.086e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207971, "epoch": 2.9040404, "global_step/max_steps": "575/990", "percentage": "58.08%", "elapsed_time": "46m 4s", "remaining_time": "33m 15s"} +{"loss": 0.14608672, "token_acc": 0.93796918, "grad_norm": 2.5615561, "learning_rate": 4.004e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208359, "epoch": 2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "46m 23s", "remaining_time": "32m 47s"} +{"eval_loss": 0.29311869, "eval_token_acc": 0.75324675, "eval_runtime": 1.8095, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 
2.92929293, "global_step/max_steps": "580/990", "percentage": "58.59%", "elapsed_time": "46m 25s", "remaining_time": "32m 48s"} +{"loss": 0.14551492, "token_acc": 0.90744065, "grad_norm": 0.75626624, "learning_rate": 3.922e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208553, "epoch": 2.95454545, "global_step/max_steps": "585/990", "percentage": "59.09%", "elapsed_time": "46m 44s", "remaining_time": "32m 21s"} +{"loss": 0.14465657, "token_acc": 0.95077582, "grad_norm": 0.73589957, "learning_rate": 3.841e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208115, "epoch": 2.97979798, "global_step/max_steps": "590/990", "percentage": "59.60%", "elapsed_time": "47m 14s", "remaining_time": "32m 1s"} +{"loss": 0.12801223, "token_acc": 0.9628496, "grad_norm": 0.22420852, "learning_rate": 3.76e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207756, "epoch": 3.00505051, "global_step/max_steps": "595/990", "percentage": "60.10%", "elapsed_time": "47m 43s", "remaining_time": "31m 41s"} +{"loss": 0.05944471, "token_acc": 0.98197133, "grad_norm": 0.56523925, "learning_rate": 3.679e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207893, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "48m 5s", "remaining_time": "31m 15s"} +{"eval_loss": 0.30186352, "eval_token_acc": 0.75724276, "eval_runtime": 1.8433, "eval_samples_per_second": 2.17, "eval_steps_per_second": 2.17, "epoch": 3.03030303, "global_step/max_steps": "600/990", "percentage": "60.61%", "elapsed_time": "48m 7s", "remaining_time": "31m 16s"} +{"loss": 0.04340479, "token_acc": 0.95780302, "grad_norm": 0.63307106, "learning_rate": 3.599e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207185, "epoch": 3.05555556, "global_step/max_steps": "605/990", "percentage": "61.11%", "elapsed_time": "48m 39s", "remaining_time": "30m 57s"} +{"loss": 0.01914702, "token_acc": 0.9906428, "grad_norm": 0.29284203, "learning_rate": 3.519e-05, "memory(GiB)": 194.42, 
"train_speed(iter/s)": 0.207564, "epoch": 3.08080808, "global_step/max_steps": "610/990", "percentage": "61.62%", "elapsed_time": "48m 58s", "remaining_time": "30m 30s"} +{"loss": 0.01184889, "token_acc": 0.99504281, "grad_norm": 0.22657505, "learning_rate": 3.439e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207984, "epoch": 3.10606061, "global_step/max_steps": "615/990", "percentage": "62.12%", "elapsed_time": "49m 16s", "remaining_time": "30m 2s"} +{"loss": 0.03439114, "token_acc": 0.98856949, "grad_norm": 0.38536501, "learning_rate": 3.36e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207846, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "49m 42s", "remaining_time": "29m 39s"} +{"eval_loss": 0.30927947, "eval_token_acc": 0.74825175, "eval_runtime": 1.8078, "eval_samples_per_second": 2.213, "eval_steps_per_second": 2.213, "epoch": 3.13131313, "global_step/max_steps": "620/990", "percentage": "62.63%", "elapsed_time": "49m 44s", "remaining_time": "29m 41s"} +{"loss": 0.03383541, "token_acc": 0.9644806, "grad_norm": 0.43870378, "learning_rate": 3.281e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207235, "epoch": 3.15656566, "global_step/max_steps": "625/990", "percentage": "63.13%", "elapsed_time": "50m 15s", "remaining_time": "29m 21s"} +{"loss": 0.04778254, "token_acc": 0.98598326, "grad_norm": 0.50692093, "learning_rate": 3.203e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207611, "epoch": 3.18181818, "global_step/max_steps": "630/990", "percentage": "63.64%", "elapsed_time": "50m 34s", "remaining_time": "28m 53s"} +{"loss": 0.06957238, "token_acc": 0.96983151, "grad_norm": 0.74996704, "learning_rate": 3.125e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207401, "epoch": 3.20707071, "global_step/max_steps": "635/990", "percentage": "64.14%", "elapsed_time": "51m 1s", "remaining_time": "28m 31s"} +{"loss": 0.06834298, "token_acc": 0.96775769, "grad_norm": 0.40983146, 
"learning_rate": 3.048e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206982, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "51m 31s", "remaining_time": "28m 10s"} +{"eval_loss": 0.30714756, "eval_token_acc": 0.74925075, "eval_runtime": 1.8289, "eval_samples_per_second": 2.187, "eval_steps_per_second": 2.187, "epoch": 3.23232323, "global_step/max_steps": "640/990", "percentage": "64.65%", "elapsed_time": "51m 33s", "remaining_time": "28m 11s"} +{"loss": 0.05352731, "token_acc": 0.95557732, "grad_norm": 0.66573399, "learning_rate": 2.971e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206307, "epoch": 3.25757576, "global_step/max_steps": "645/990", "percentage": "65.15%", "elapsed_time": "52m 6s", "remaining_time": "27m 52s"} +{"loss": 0.05851475, "token_acc": 0.9809718, "grad_norm": 0.35414842, "learning_rate": 2.895e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206557, "epoch": 3.28282828, "global_step/max_steps": "650/990", "percentage": "65.66%", "elapsed_time": "52m 26s", "remaining_time": "27m 25s"} +{"loss": 0.02573989, "token_acc": 0.99375195, "grad_norm": 0.41236743, "learning_rate": 2.82e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206671, "epoch": 3.30808081, "global_step/max_steps": "655/990", "percentage": "66.16%", "elapsed_time": "52m 48s", "remaining_time": "27m 0s"} +{"loss": 0.04316897, "token_acc": 0.98503563, "grad_norm": 0.51062256, "learning_rate": 2.745e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206584, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "53m 14s", "remaining_time": "26m 37s"} +{"eval_loss": 0.30926383, "eval_token_acc": 0.74625375, "eval_runtime": 1.7766, "eval_samples_per_second": 2.251, "eval_steps_per_second": 2.251, "epoch": 3.33333333, "global_step/max_steps": "660/990", "percentage": "66.67%", "elapsed_time": "53m 16s", "remaining_time": "26m 38s"} +{"loss": 0.0448059, "token_acc": 
0.93844101, "grad_norm": 0.75110435, "learning_rate": 2.671e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206749, "epoch": 3.35858586, "global_step/max_steps": "665/990", "percentage": "67.17%", "elapsed_time": "53m 36s", "remaining_time": "26m 11s"} +{"loss": 0.04475101, "token_acc": 0.98166877, "grad_norm": 0.70611215, "learning_rate": 2.597e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206885, "epoch": 3.38383838, "global_step/max_steps": "670/990", "percentage": "67.68%", "elapsed_time": "53m 58s", "remaining_time": "25m 46s"} +{"loss": 0.03667647, "token_acc": 0.98367848, "grad_norm": 0.35980213, "learning_rate": 2.524e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207096, "epoch": 3.40909091, "global_step/max_steps": "675/990", "percentage": "68.18%", "elapsed_time": "54m 18s", "remaining_time": "25m 20s"} +{"loss": 0.05962592, "token_acc": 0.97330408, "grad_norm": 0.49686399, "learning_rate": 2.452e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207419, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "54m 38s", "remaining_time": "24m 54s"} +{"eval_loss": 0.30547747, "eval_token_acc": 0.74225774, "eval_runtime": 1.7648, "eval_samples_per_second": 2.267, "eval_steps_per_second": 2.267, "epoch": 3.43434343, "global_step/max_steps": "680/990", "percentage": "68.69%", "elapsed_time": "54m 39s", "remaining_time": "24m 55s"} +{"loss": 0.07604837, "token_acc": 0.93169278, "grad_norm": 0.36548638, "learning_rate": 2.38e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207066, "epoch": 3.45959596, "global_step/max_steps": "685/990", "percentage": "69.19%", "elapsed_time": "55m 7s", "remaining_time": "24m 32s"} +{"loss": 0.05938977, "token_acc": 0.97271157, "grad_norm": 0.95408553, "learning_rate": 2.31e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206879, "epoch": 3.48484848, "global_step/max_steps": "690/990", "percentage": "69.70%", "elapsed_time": "55m 34s", "remaining_time": "24m 
9s"} +{"loss": 0.05741436, "token_acc": 0.98016781, "grad_norm": 0.49375463, "learning_rate": 2.24e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207044, "epoch": 3.51010101, "global_step/max_steps": "695/990", "percentage": "70.20%", "elapsed_time": "55m 56s", "remaining_time": "23m 44s"} +{"loss": 0.01394528, "token_acc": 0.99569454, "grad_norm": 0.1865593, "learning_rate": 2.17e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207418, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "56m 14s", "remaining_time": "23m 17s"} +{"eval_loss": 0.30903569, "eval_token_acc": 0.74325674, "eval_runtime": 1.7685, "eval_samples_per_second": 2.262, "eval_steps_per_second": 2.262, "epoch": 3.53535354, "global_step/max_steps": "700/990", "percentage": "70.71%", "elapsed_time": "56m 16s", "remaining_time": "23m 18s"} +{"loss": 0.03774779, "token_acc": 0.9390336, "grad_norm": 0.28193864, "learning_rate": 2.102e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207569, "epoch": 3.56060606, "global_step/max_steps": "705/990", "percentage": "71.21%", "elapsed_time": "56m 36s", "remaining_time": "22m 52s"} +{"loss": 0.03159323, "token_acc": 0.98888772, "grad_norm": 0.43836522, "learning_rate": 2.034e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207322, "epoch": 3.58585859, "global_step/max_steps": "710/990", "percentage": "71.72%", "elapsed_time": "57m 4s", "remaining_time": "22m 30s"} +{"loss": 0.00255849, "token_acc": 0.99913006, "grad_norm": 0.6935764, "learning_rate": 1.967e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.20803, "epoch": 3.61111111, "global_step/max_steps": "715/990", "percentage": "72.22%", "elapsed_time": "57m 16s", "remaining_time": "22m 1s"} +{"loss": 0.01584032, "token_acc": 0.99573156, "grad_norm": 0.71256214, "learning_rate": 1.901e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207895, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": 
"57m 42s", "remaining_time": "21m 38s"} +{"eval_loss": 0.31770468, "eval_token_acc": 0.74325674, "eval_runtime": 1.7785, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 3.63636364, "global_step/max_steps": "720/990", "percentage": "72.73%", "elapsed_time": "57m 44s", "remaining_time": "21m 39s"} +{"loss": 0.03762939, "token_acc": 0.95534347, "grad_norm": 0.46797064, "learning_rate": 1.836e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207656, "epoch": 3.66161616, "global_step/max_steps": "725/990", "percentage": "73.23%", "elapsed_time": "58m 10s", "remaining_time": "21m 16s"} +{"loss": 0.05497429, "token_acc": 0.9796425, "grad_norm": 0.31816274, "learning_rate": 1.772e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207349, "epoch": 3.68686869, "global_step/max_steps": "730/990", "percentage": "73.74%", "elapsed_time": "58m 40s", "remaining_time": "20m 53s"} +{"loss": 0.03574101, "token_acc": 0.98391794, "grad_norm": 0.39590964, "learning_rate": 1.709e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207241, "epoch": 3.71212121, "global_step/max_steps": "735/990", "percentage": "74.24%", "elapsed_time": "59m 6s", "remaining_time": "20m 30s"} +{"loss": 0.04509756, "token_acc": 0.98701299, "grad_norm": 0.61121833, "learning_rate": 1.646e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207546, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "59m 25s", "remaining_time": "20m 4s"} +{"eval_loss": 0.3148863, "eval_token_acc": 0.74825175, "eval_runtime": 1.8107, "eval_samples_per_second": 2.209, "eval_steps_per_second": 2.209, "epoch": 3.73737374, "global_step/max_steps": "740/990", "percentage": "74.75%", "elapsed_time": "59m 26s", "remaining_time": "20m 5s"} +{"loss": 0.03688705, "token_acc": 0.91727891, "grad_norm": 0.28851324, "learning_rate": 1.585e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207875, "epoch": 3.76262626, "global_step/max_steps": "745/990", 
"percentage": "75.25%", "elapsed_time": "59m 43s", "remaining_time": "19m 38s"} +{"loss": 0.03432915, "token_acc": 0.98982008, "grad_norm": 0.71633101, "learning_rate": 1.524e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208273, "epoch": 3.78787879, "global_step/max_steps": "750/990", "percentage": "75.76%", "elapsed_time": "1h 0m 0s", "remaining_time": "19m 12s"} +{"loss": 0.11100113, "token_acc": 0.95342936, "grad_norm": 0.49357241, "learning_rate": 1.464e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207493, "epoch": 3.81313131, "global_step/max_steps": "755/990", "percentage": "76.26%", "elapsed_time": "1h 0m 38s", "remaining_time": "18m 52s"} +{"loss": 0.01180831, "token_acc": 0.99599145, "grad_norm": 0.60982263, "learning_rate": 1.406e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208006, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "1h 0m 53s", "remaining_time": "18m 25s"} +{"eval_loss": 0.31223604, "eval_token_acc": 0.74525475, "eval_runtime": 1.7841, "eval_samples_per_second": 2.242, "eval_steps_per_second": 2.242, "epoch": 3.83838384, "global_step/max_steps": "760/990", "percentage": "76.77%", "elapsed_time": "1h 0m 55s", "remaining_time": "18m 26s"} +{"loss": 0.05923341, "token_acc": 0.93747126, "grad_norm": 0.46704805, "learning_rate": 1.348e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207991, "epoch": 3.86363636, "global_step/max_steps": "765/990", "percentage": "77.27%", "elapsed_time": "1h 1m 17s", "remaining_time": "18m 1s"} +{"loss": 0.03186772, "token_acc": 0.98956133, "grad_norm": 0.22090964, "learning_rate": 1.292e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207996, "epoch": 3.88888889, "global_step/max_steps": "770/990", "percentage": "77.78%", "elapsed_time": "1h 1m 41s", "remaining_time": "17m 37s"} +{"loss": 0.03672673, "token_acc": 0.98799027, "grad_norm": 0.48269495, "learning_rate": 1.236e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207983, 
"epoch": 3.91414141, "global_step/max_steps": "775/990", "percentage": "78.28%", "elapsed_time": "1h 2m 5s", "remaining_time": "17m 13s"} +{"loss": 0.04796214, "token_acc": 0.98059072, "grad_norm": 0.42812356, "learning_rate": 1.182e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207969, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "1h 2m 30s", "remaining_time": "16m 49s"} +{"eval_loss": 0.31143019, "eval_token_acc": 0.74725275, "eval_runtime": 1.7899, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 3.93939394, "global_step/max_steps": "780/990", "percentage": "78.79%", "elapsed_time": "1h 2m 31s", "remaining_time": "16m 50s"} +{"loss": 0.03378086, "token_acc": 0.93375796, "grad_norm": 0.45910347, "learning_rate": 1.128e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208066, "epoch": 3.96464646, "global_step/max_steps": "785/990", "percentage": "79.29%", "elapsed_time": "1h 2m 52s", "remaining_time": "16m 25s"} +{"loss": 0.04626538, "token_acc": 0.98586572, "grad_norm": 0.55262733, "learning_rate": 1.076e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208239, "epoch": 3.98989899, "global_step/max_steps": "790/990", "percentage": "79.80%", "elapsed_time": "1h 3m 13s", "remaining_time": "16m 0s"} +{"loss": 0.03309385, "token_acc": 0.98944507, "grad_norm": 0.28537101, "learning_rate": 1.025e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208413, "epoch": 4.01515152, "global_step/max_steps": "795/990", "percentage": "80.30%", "elapsed_time": "1h 3m 34s", "remaining_time": "15m 35s"} +{"loss": 0.00834362, "token_acc": 0.99876662, "grad_norm": 0.26249537, "learning_rate": 9.75e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208448, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "1h 3m 57s", "remaining_time": "15m 11s"} +{"eval_loss": 0.3129116, "eval_token_acc": 0.74425574, "eval_runtime": 1.8149, 
"eval_samples_per_second": 2.204, "eval_steps_per_second": 2.204, "epoch": 4.04040404, "global_step/max_steps": "800/990", "percentage": "80.81%", "elapsed_time": "1h 3m 59s", "remaining_time": "15m 11s"} +{"loss": 0.01603064, "token_acc": 0.96277133, "grad_norm": 0.37422115, "learning_rate": 9.26e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208215, "epoch": 4.06565657, "global_step/max_steps": "805/990", "percentage": "81.31%", "elapsed_time": "1h 4m 25s", "remaining_time": "14m 48s"} +{"loss": 0.00266502, "token_acc": 0.99980357, "grad_norm": 0.01700466, "learning_rate": 8.78e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208521, "epoch": 4.09090909, "global_step/max_steps": "810/990", "percentage": "81.82%", "elapsed_time": "1h 4m 44s", "remaining_time": "14m 23s"} +{"loss": 0.01716046, "token_acc": 0.99316005, "grad_norm": 0.34164453, "learning_rate": 8.31e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208645, "epoch": 4.11616162, "global_step/max_steps": "815/990", "percentage": "82.32%", "elapsed_time": "1h 5m 5s", "remaining_time": "13m 58s"} +{"loss": 0.00726497, "token_acc": 0.99859507, "grad_norm": 0.02550986, "learning_rate": 7.86e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208653, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "1h 5m 29s", "remaining_time": "13m 34s"} +{"eval_loss": 0.31731296, "eval_token_acc": 0.74225774, "eval_runtime": 1.7956, "eval_samples_per_second": 2.228, "eval_steps_per_second": 2.228, "epoch": 4.14141414, "global_step/max_steps": "820/990", "percentage": "82.83%", "elapsed_time": "1h 5m 31s", "remaining_time": "13m 35s"} +{"loss": 0.02214038, "token_acc": 0.96700453, "grad_norm": 0.2720685, "learning_rate": 7.41e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208283, "epoch": 4.16666667, "global_step/max_steps": "825/990", "percentage": "83.33%", "elapsed_time": "1h 6m 0s", "remaining_time": "13m 12s"} +{"loss": 0.04925654, "token_acc": 
0.96120627, "grad_norm": 0.34120539, "learning_rate": 6.98e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208304, "epoch": 4.19191919, "global_step/max_steps": "830/990", "percentage": "83.84%", "elapsed_time": "1h 6m 24s", "remaining_time": "12m 48s"} +{"loss": 0.02585154, "token_acc": 0.99150251, "grad_norm": 0.37129018, "learning_rate": 6.56e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208305, "epoch": 4.21717172, "global_step/max_steps": "835/990", "percentage": "84.34%", "elapsed_time": "1h 6m 48s", "remaining_time": "12m 24s"} +{"loss": 0.02727468, "token_acc": 0.98676601, "grad_norm": 0.31404537, "learning_rate": 6.15e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208162, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "1h 7m 14s", "remaining_time": "12m 0s"} +{"eval_loss": 0.32319602, "eval_token_acc": 0.74325674, "eval_runtime": 1.7959, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 4.24242424, "global_step/max_steps": "840/990", "percentage": "84.85%", "elapsed_time": "1h 7m 16s", "remaining_time": "12m 0s"} +{"loss": 0.07043209, "token_acc": 0.94830711, "grad_norm": 0.74358189, "learning_rate": 5.76e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207659, "epoch": 4.26767677, "global_step/max_steps": "845/990", "percentage": "85.35%", "elapsed_time": "1h 7m 48s", "remaining_time": "11m 38s"} +{"loss": 0.04966149, "token_acc": 0.98177746, "grad_norm": 0.39616713, "learning_rate": 5.37e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207405, "epoch": 4.29292929, "global_step/max_steps": "850/990", "percentage": "85.86%", "elapsed_time": "1h 8m 17s", "remaining_time": "11m 14s"} +{"loss": 0.01363417, "token_acc": 0.99486788, "grad_norm": 0.12084543, "learning_rate": 5e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207257, "epoch": 4.31818182, "global_step/max_steps": "855/990", "percentage": "86.36%", "elapsed_time": "1h 8m 44s", "remaining_time": 
"10m 51s"} +{"loss": 0.01033109, "token_acc": 0.99758389, "grad_norm": 0.18208143, "learning_rate": 4.65e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207206, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "1h 9m 10s", "remaining_time": "10m 27s"} +{"eval_loss": 0.32810417, "eval_token_acc": 0.74525475, "eval_runtime": 1.824, "eval_samples_per_second": 2.193, "eval_steps_per_second": 2.193, "epoch": 4.34343434, "global_step/max_steps": "860/990", "percentage": "86.87%", "elapsed_time": "1h 9m 11s", "remaining_time": "10m 27s"} +{"loss": 0.00932074, "token_acc": 0.96391683, "grad_norm": 0.1977299, "learning_rate": 4.3e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207088, "epoch": 4.36868687, "global_step/max_steps": "865/990", "percentage": "87.37%", "elapsed_time": "1h 9m 36s", "remaining_time": "10m 3s"} +{"loss": 0.00707825, "token_acc": 0.99818302, "grad_norm": 0.20129217, "learning_rate": 3.97e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207247, "epoch": 4.39393939, "global_step/max_steps": "870/990", "percentage": "87.88%", "elapsed_time": "1h 9m 57s", "remaining_time": "9m 38s"} +{"loss": 0.00643899, "token_acc": 0.99709921, "grad_norm": 0.15680422, "learning_rate": 3.65e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207512, "epoch": 4.41919192, "global_step/max_steps": "875/990", "percentage": "88.38%", "elapsed_time": "1h 10m 16s", "remaining_time": "9m 14s"} +{"loss": 0.01718273, "token_acc": 0.99439479, "grad_norm": 0.12446325, "learning_rate": 3.34e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207596, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", "elapsed_time": "1h 10m 38s", "remaining_time": "8m 49s"} +{"eval_loss": 0.33027837, "eval_token_acc": 0.74325674, "eval_runtime": 1.8139, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "epoch": 4.44444444, "global_step/max_steps": "880/990", "percentage": "88.89%", 
"elapsed_time": "1h 10m 40s", "remaining_time": "8m 50s"} +{"loss": 0.0076538, "token_acc": 0.96541527, "grad_norm": 0.21941973, "learning_rate": 3.05e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207479, "epoch": 4.46969697, "global_step/max_steps": "885/990", "percentage": "89.39%", "elapsed_time": "1h 11m 5s", "remaining_time": "8m 26s"} +{"loss": 0.01345368, "token_acc": 0.99713506, "grad_norm": 0.41804647, "learning_rate": 2.77e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207427, "epoch": 4.49494949, "global_step/max_steps": "890/990", "percentage": "89.90%", "elapsed_time": "1h 11m 30s", "remaining_time": "8m 2s"} +{"loss": 0.02295569, "token_acc": 0.99321964, "grad_norm": 0.36694837, "learning_rate": 2.5e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207683, "epoch": 4.52020202, "global_step/max_steps": "895/990", "percentage": "90.40%", "elapsed_time": "1h 11m 49s", "remaining_time": "7m 37s"} +{"loss": 0.03834372, "token_acc": 0.98764487, "grad_norm": 0.30651757, "learning_rate": 2.24e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207506, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "1h 12m 16s", "remaining_time": "7m 13s"} +{"eval_loss": 0.33111608, "eval_token_acc": 0.74025974, "eval_runtime": 1.8047, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 4.54545455, "global_step/max_steps": "900/990", "percentage": "90.91%", "elapsed_time": "1h 12m 18s", "remaining_time": "7m 13s"} +{"loss": 0.00275137, "token_acc": 0.95826142, "grad_norm": 0.06433414, "learning_rate": 2e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207542, "epoch": 4.57070707, "global_step/max_steps": "905/990", "percentage": "91.41%", "elapsed_time": "1h 12m 40s", "remaining_time": "6m 49s"} +{"loss": 0.00754023, "token_acc": 0.99844277, "grad_norm": 0.0814418, "learning_rate": 1.78e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207932, "epoch": 4.5959596, 
"global_step/max_steps": "910/990", "percentage": "91.92%", "elapsed_time": "1h 12m 56s", "remaining_time": "6m 24s"} +{"loss": 0.01742658, "token_acc": 0.99344812, "grad_norm": 0.01438748, "learning_rate": 1.56e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208031, "epoch": 4.62121212, "global_step/max_steps": "915/990", "percentage": "92.42%", "elapsed_time": "1h 13m 18s", "remaining_time": "6m 0s"} +{"loss": 0.00543078, "token_acc": 0.99783924, "grad_norm": 0.2927981, "learning_rate": 1.36e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208309, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "1h 13m 36s", "remaining_time": "5m 36s"} +{"eval_loss": 0.33348954, "eval_token_acc": 0.73726274, "eval_runtime": 1.796, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 4.64646465, "global_step/max_steps": "920/990", "percentage": "92.93%", "elapsed_time": "1h 13m 37s", "remaining_time": "5m 36s"} +{"loss": 0.00238446, "token_acc": 0.95586047, "grad_norm": 0.14231211, "learning_rate": 1.18e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208319, "epoch": 4.67171717, "global_step/max_steps": "925/990", "percentage": "93.43%", "elapsed_time": "1h 13m 59s", "remaining_time": "5m 11s"} +{"loss": 0.01498995, "token_acc": 0.9928401, "grad_norm": 0.00572622, "learning_rate": 1e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208549, "epoch": 4.6969697, "global_step/max_steps": "930/990", "percentage": "93.94%", "elapsed_time": "1h 14m 19s", "remaining_time": "4m 47s"} +{"loss": 0.02435753, "token_acc": 0.98951429, "grad_norm": 0.42016208, "learning_rate": 8.4e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208233, "epoch": 4.72222222, "global_step/max_steps": "935/990", "percentage": "94.44%", "elapsed_time": "1h 14m 49s", "remaining_time": "4m 24s"} +{"loss": 0.01552925, "token_acc": 0.99368036, "grad_norm": 0.15343283, "learning_rate": 7e-07, "memory(GiB)": 194.45, 
"train_speed(iter/s)": 0.208371, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "1h 15m 10s", "remaining_time": "3m 59s"} +{"eval_loss": 0.33328468, "eval_token_acc": 0.74125874, "eval_runtime": 1.8041, "eval_samples_per_second": 2.217, "eval_steps_per_second": 2.217, "epoch": 4.74747475, "global_step/max_steps": "940/990", "percentage": "94.95%", "elapsed_time": "1h 15m 12s", "remaining_time": "4m 0s"} +{"loss": 0.01205244, "token_acc": 0.97242111, "grad_norm": 0.08844652, "learning_rate": 5.6e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207774, "epoch": 4.77272727, "global_step/max_steps": "945/990", "percentage": "95.45%", "elapsed_time": "1h 15m 47s", "remaining_time": "3m 36s"} +{"loss": 0.01021889, "token_acc": 0.99422984, "grad_norm": 0.01477067, "learning_rate": 4.5e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207847, "epoch": 4.7979798, "global_step/max_steps": "950/990", "percentage": "95.96%", "elapsed_time": "1h 16m 10s", "remaining_time": "3m 12s"} +{"loss": 0.00310982, "token_acc": 0.99893927, "grad_norm": 0.14186975, "learning_rate": 3.4e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207847, "epoch": 4.82323232, "global_step/max_steps": "955/990", "percentage": "96.46%", "elapsed_time": "1h 16m 34s", "remaining_time": "2m 48s"} +{"loss": 0.02708366, "token_acc": 0.99180203, "grad_norm": 0.34661114, "learning_rate": 2.5e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207931, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "1h 16m 56s", "remaining_time": "2m 24s"} +{"eval_loss": 0.33344024, "eval_token_acc": 0.74125874, "eval_runtime": 1.7787, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 4.84848485, "global_step/max_steps": "960/990", "percentage": "96.97%", "elapsed_time": "1h 16m 58s", "remaining_time": "2m 24s"} +{"loss": 0.01345811, "token_acc": 0.95346198, "grad_norm": 0.18189794, 
"learning_rate": 1.7e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207945, "epoch": 4.87373737, "global_step/max_steps": "965/990", "percentage": "97.47%", "elapsed_time": "1h 17m 20s", "remaining_time": "2m 0s"} +{"loss": 0.00872324, "token_acc": 0.99732844, "grad_norm": 0.23113778, "learning_rate": 1.1e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.20813, "epoch": 4.8989899, "global_step/max_steps": "970/990", "percentage": "97.98%", "elapsed_time": "1h 17m 40s", "remaining_time": "1m 36s"} +{"loss": 0.00528367, "token_acc": 0.99857839, "grad_norm": 0.08570379, "learning_rate": 6e-08, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208414, "epoch": 4.92424242, "global_step/max_steps": "975/990", "percentage": "98.48%", "elapsed_time": "1h 17m 57s", "remaining_time": "1m 11s"} +{"loss": 0.03540944, "token_acc": 0.98341269, "grad_norm": 0.05370381, "learning_rate": 3e-08, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208454, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "1h 18m 20s", "remaining_time": "47s"} +{"eval_loss": 0.33373889, "eval_token_acc": 0.73726274, "eval_runtime": 1.7669, "eval_samples_per_second": 2.264, "eval_steps_per_second": 2.264, "epoch": 4.94949495, "global_step/max_steps": "980/990", "percentage": "98.99%", "elapsed_time": "1h 18m 22s", "remaining_time": "47s"} +{"loss": 0.06211912, "token_acc": 0.95004783, "grad_norm": 0.2665289, "learning_rate": 1e-08, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208152, "epoch": 4.97474747, "global_step/max_steps": "985/990", "percentage": "99.49%", "elapsed_time": "1h 18m 51s", "remaining_time": "24s"} +{"loss": 0.00358277, "token_acc": 0.99951346, "grad_norm": 0.14736372, "learning_rate": 0.0, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208263, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 13s", "remaining_time": "0s"} +{"eval_loss": 0.33240235, "eval_token_acc": 0.74325674, 
"eval_runtime": 1.7947, "eval_samples_per_second": 2.229, "eval_steps_per_second": 2.229, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 15s", "remaining_time": "0s"} +{"train_runtime": 4757.6516, "train_samples_per_second": 0.416, "train_steps_per_second": 0.208, "total_flos": 6.310258299632026e+17, "train_loss": 0.19778904, "epoch": 5.0, "global_step/max_steps": "990/990", "percentage": "100.00%", "elapsed_time": "1h 19m 17s", "remaining_time": "0s"} +{"train_dataset": "761.964646±623.950261, min=40.000000, max=4021.000000, size=396", "val_dataset": "307.500000±311.674911, min=84.000000, max=841.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 70760.8003M Params (207.0938M Trainable [0.2927%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-990", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/checkpoint-560", "best_metric": 0.28748947, "global_step": 990, "log_history": [{"loss": 0.5845019221305847, "token_acc": 0.8486238532110092, "grad_norm": 0.5212649703025818, "learning_rate": 2.0000000000000003e-06, "memory(GiB)": 143.92, "train_speed(iter/s)": 0.152386, "epoch": 0.005050505050505051, "step": 1}, {"loss": 0.7612156867980957, "token_acc": 0.8290306867998052, "grad_norm": 0.8231858015060425, "learning_rate": 1e-05, "memory(GiB)": 153.24, "train_speed(iter/s)": 0.221696, "epoch": 0.025252525252525252, "step": 5}, {"loss": 0.8103227615356445, "token_acc": 0.7863496684457383, "grad_norm": 0.613670825958252, "learning_rate": 2e-05, "memory(GiB)": 160.15, "train_speed(iter/s)": 0.228033, "epoch": 0.050505050505050504, "step": 10}, {"loss": 0.7224256038665772, "token_acc": 0.8002382843526609, "grad_norm": 0.5065694451332092, "learning_rate": 3e-05, "memory(GiB)": 169.88, 
"train_speed(iter/s)": 0.222881, "epoch": 0.07575757575757576, "step": 15}, {"loss": 0.783719539642334, "token_acc": 0.865086333040679, "grad_norm": 5.616745471954346, "learning_rate": 4e-05, "memory(GiB)": 169.88, "train_speed(iter/s)": 0.240866, "epoch": 0.10101010101010101, "step": 20}, {"eval_loss": 1.1500141620635986, "eval_token_acc": 0.7232767232767233, "eval_runtime": 1.8151, "eval_samples_per_second": 2.204, "eval_steps_per_second": 2.204, "epoch": 0.10101010101010101, "step": 20}, {"loss": 0.5110222339630127, "token_acc": 0.8279441117764471, "grad_norm": 0.4805212914943695, "learning_rate": 5e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.217821, "epoch": 0.12626262626262627, "step": 25}, {"loss": 0.5762582778930664, "token_acc": 0.8128789462680326, "grad_norm": 1.0994250774383545, "learning_rate": 6e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.225877, "epoch": 0.15151515151515152, "step": 30}, {"loss": 0.3708608150482178, "token_acc": 0.8526220614828209, "grad_norm": 0.30022498965263367, "learning_rate": 7e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.229602, "epoch": 0.17676767676767677, "step": 35}, {"loss": 0.44199090003967284, "token_acc": 0.8467268299670534, "grad_norm": 0.31309282779693604, "learning_rate": 8e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.229371, "epoch": 0.20202020202020202, "step": 40}, {"eval_loss": 0.5262672305107117, "eval_token_acc": 0.7352647352647352, "eval_runtime": 1.7976, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 0.20202020202020202, "step": 40}, {"loss": 0.4439223289489746, "token_acc": 0.837237851662404, "grad_norm": 0.4576653838157654, "learning_rate": 9e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.220221, "epoch": 0.22727272727272727, "step": 45}, {"loss": 0.5241156101226807, "token_acc": 0.8669076569175156, "grad_norm": 0.3684069514274597, "learning_rate": 0.0001, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.215242, "epoch": 0.25252525252525254, 
"step": 50}, {"loss": 0.4691337585449219, "token_acc": 0.8475441501103753, "grad_norm": 0.4167088270187378, "learning_rate": 9.999301905929286e-05, "memory(GiB)": 178.91, "train_speed(iter/s)": 0.215104, "epoch": 0.2777777777777778, "step": 55}, {"loss": 0.36278769969940183, "token_acc": 0.8557241379310345, "grad_norm": 0.4622468948364258, "learning_rate": 9.997207818651274e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.220608, "epoch": 0.30303030303030304, "step": 60}, {"eval_loss": 0.4990268349647522, "eval_token_acc": 0.7402597402597403, "eval_runtime": 1.7923, "eval_samples_per_second": 2.232, "eval_steps_per_second": 2.232, "epoch": 0.30303030303030304, "step": 60}, {"loss": 0.5440140247344971, "token_acc": 0.8285714285714286, "grad_norm": 0.6556711792945862, "learning_rate": 9.99371832291393e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.215587, "epoch": 0.3282828282828283, "step": 65}, {"loss": 0.40032358169555665, "token_acc": 0.8736349453978159, "grad_norm": 0.27956634759902954, "learning_rate": 9.988834393115767e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.213465, "epoch": 0.35353535353535354, "step": 70}, {"loss": 0.5715262413024902, "token_acc": 0.833641404805915, "grad_norm": 0.4587284028530121, "learning_rate": 9.982557393033758e-05, "memory(GiB)": 178.92, "train_speed(iter/s)": 0.217719, "epoch": 0.3787878787878788, "step": 75}, {"loss": 0.6002557277679443, "token_acc": 0.8368476147749364, "grad_norm": 0.3916527330875397, "learning_rate": 9.974889075442521e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.218182, "epoch": 0.40404040404040403, "step": 80}, {"eval_loss": 0.5162495970726013, "eval_token_acc": 0.7392607392607392, "eval_runtime": 1.8197, "eval_samples_per_second": 2.198, "eval_steps_per_second": 2.198, "epoch": 0.40404040404040403, "step": 80}, {"loss": 0.5449264526367188, "token_acc": 0.8142857142857143, "grad_norm": 0.27539631724357605, "learning_rate": 9.965831581624871e-05, "memory(GiB)": 178.95, 
"train_speed(iter/s)": 0.214816, "epoch": 0.4292929292929293, "step": 85}, {"loss": 0.43953518867492675, "token_acc": 0.8481365377917102, "grad_norm": 0.6312692761421204, "learning_rate": 9.9553874407739e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.216199, "epoch": 0.45454545454545453, "step": 90}, {"loss": 0.3530363798141479, "token_acc": 0.8667912439935932, "grad_norm": 0.33478057384490967, "learning_rate": 9.94355956928673e-05, "memory(GiB)": 178.95, "train_speed(iter/s)": 0.215501, "epoch": 0.4797979797979798, "step": 95}, {"loss": 0.4162618637084961, "token_acc": 0.8655149051490515, "grad_norm": 0.2933506965637207, "learning_rate": 9.930351269950143e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212789, "epoch": 0.5050505050505051, "step": 100}, {"eval_loss": 0.5181649327278137, "eval_token_acc": 0.7402597402597403, "eval_runtime": 1.7982, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 0.5050505050505051, "step": 100}, {"loss": 0.5441759586334228, "token_acc": 0.8028541226215645, "grad_norm": 0.48549583554267883, "learning_rate": 9.915766231018318e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212013, "epoch": 0.5303030303030303, "step": 105}, {"loss": 0.5125463962554931, "token_acc": 0.8068669527896996, "grad_norm": 1.273707628250122, "learning_rate": 9.899808525182935e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.216603, "epoch": 0.5555555555555556, "step": 110}, {"loss": 0.45648856163024903, "token_acc": 0.8614628614628614, "grad_norm": 0.24062682688236237, "learning_rate": 9.882482608435923e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.213323, "epoch": 0.5808080808080808, "step": 115}, {"loss": 0.5108787059783936, "token_acc": 0.8145441030723488, "grad_norm": 0.7656003832817078, "learning_rate": 9.863793318825186e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212345, "epoch": 0.6060606060606061, "step": 120}, {"eval_loss": 0.49168646335601807, "eval_token_acc": 0.7402597402597403, 
"eval_runtime": 1.7863, "eval_samples_per_second": 2.239, "eval_steps_per_second": 2.239, "epoch": 0.6060606060606061, "step": 120}, {"loss": 0.4693441867828369, "token_acc": 0.8352281825460368, "grad_norm": 0.3627430200576782, "learning_rate": 9.843745875103627e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.212255, "epoch": 0.6313131313131313, "step": 125}, {"loss": 0.47632036209106443, "token_acc": 0.8450905624404195, "grad_norm": 0.39431232213974, "learning_rate": 9.822345875271883e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211377, "epoch": 0.6565656565656566, "step": 130}, {"loss": 0.3673699378967285, "token_acc": 0.8731134712129681, "grad_norm": 0.22374582290649414, "learning_rate": 9.799599295015154e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211464, "epoch": 0.6818181818181818, "step": 135}, {"loss": 0.5292303085327148, "token_acc": 0.8273034877667881, "grad_norm": 0.3889102041721344, "learning_rate": 9.775512486034563e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.211083, "epoch": 0.7070707070707071, "step": 140}, {"eval_loss": 0.43900543451309204, "eval_token_acc": 0.7422577422577422, "eval_runtime": 1.8334, "eval_samples_per_second": 2.182, "eval_steps_per_second": 2.182, "epoch": 0.7070707070707071, "step": 140}, {"loss": 0.34286372661590575, "token_acc": 0.8596546310832025, "grad_norm": 0.5705273151397705, "learning_rate": 9.750092174273521e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.208774, "epoch": 0.7323232323232324, "step": 145}, {"loss": 0.4045823097229004, "token_acc": 0.8724293596388564, "grad_norm": 0.9250723123550415, "learning_rate": 9.723345458039594e-05, "memory(GiB)": 194.37, "train_speed(iter/s)": 0.20966, "epoch": 0.7575757575757576, "step": 150}, {"loss": 0.4202705383300781, "token_acc": 0.8564288391853055, "grad_norm": 0.4707590639591217, "learning_rate": 9.69527980602239e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209281, "epoch": 0.7828282828282829, "step": 155}, {"loss": 
0.34715895652770995, "token_acc": 0.8820569271898098, "grad_norm": 0.4233284890651703, "learning_rate": 9.665903055208014e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210051, "epoch": 0.8080808080808081, "step": 160}, {"eval_loss": 0.4276379346847534, "eval_token_acc": 0.7472527472527473, "eval_runtime": 1.8048, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 0.8080808080808081, "step": 160}, {"loss": 0.4147395133972168, "token_acc": 0.8452348628835189, "grad_norm": 0.2773014008998871, "learning_rate": 9.635223408690688e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209147, "epoch": 0.8333333333333334, "step": 165}, {"loss": 0.4583017349243164, "token_acc": 0.853536021150033, "grad_norm": 0.4066998064517975, "learning_rate": 9.603249433382144e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208696, "epoch": 0.8585858585858586, "step": 170}, {"loss": 0.4163652896881104, "token_acc": 0.8529512111907199, "grad_norm": 0.49174752831459045, "learning_rate": 9.569990057619414e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209604, "epoch": 0.8838383838383839, "step": 175}, {"loss": 0.422959041595459, "token_acc": 0.8615751789976134, "grad_norm": 0.3412103056907654, "learning_rate": 9.535454568671704e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.21007, "epoch": 0.9090909090909091, "step": 180}, {"eval_loss": 0.4359378516674042, "eval_token_acc": 0.7552447552447552, "eval_runtime": 1.7765, "eval_samples_per_second": 2.252, "eval_steps_per_second": 2.252, "epoch": 0.9090909090909091, "step": 180}, {"loss": 0.4941267967224121, "token_acc": 0.8200705112062453, "grad_norm": 0.481300413608551, "learning_rate": 9.49965261014704e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20925, "epoch": 0.9343434343434344, "step": 185}, {"loss": 0.7393960952758789, "token_acc": 0.799098337559876, "grad_norm": 2.57511305809021, "learning_rate": 9.462594179299406e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210972, "epoch": 
0.9595959595959596, "step": 190}, {"loss": 0.5489778041839599, "token_acc": 0.8190687361419069, "grad_norm": 0.27773517370224, "learning_rate": 9.424289624237144e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210161, "epoch": 0.9848484848484849, "step": 195}, {"loss": 0.4746543407440186, "token_acc": 0.8563213924935893, "grad_norm": 0.2995460033416748, "learning_rate": 9.384749641033359e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207474, "epoch": 1.0101010101010102, "step": 200}, {"eval_loss": 0.4713101387023926, "eval_token_acc": 0.7502497502497503, "eval_runtime": 1.7784, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 1.0101010101010102, "step": 200}, {"loss": 0.4258098602294922, "token_acc": 0.8379081675480567, "grad_norm": 0.2918975055217743, "learning_rate": 9.343985270739182e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204959, "epoch": 1.0353535353535352, "step": 205}, {"loss": 0.3576375722885132, "token_acc": 0.8768303186907838, "grad_norm": 0.4730541408061981, "learning_rate": 9.302007896300698e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204647, "epoch": 1.0606060606060606, "step": 210}, {"loss": 0.330736780166626, "token_acc": 0.8893819007326386, "grad_norm": 0.3962819576263428, "learning_rate": 9.25882923938038e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203729, "epoch": 1.0858585858585859, "step": 215}, {"loss": 0.29806084632873536, "token_acc": 0.9017960602549246, "grad_norm": 0.551486074924469, "learning_rate": 9.214461357083985e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204164, "epoch": 1.1111111111111112, "step": 220}, {"eval_loss": 0.4675810933113098, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.7981, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "epoch": 1.1111111111111112, "step": 220}, {"loss": 0.42431864738464353, "token_acc": 0.8460293607675413, "grad_norm": 0.5311359167098999, "learning_rate": 9.168916638593736e-05, "memory(GiB)": 
194.42, "train_speed(iter/s)": 0.203358, "epoch": 1.1363636363636362, "step": 225}, {"loss": 0.36201488971710205, "token_acc": 0.8752454042477245, "grad_norm": 0.39372366666793823, "learning_rate": 9.122207801708802e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.202013, "epoch": 1.1616161616161615, "step": 230}, {"loss": 0.1575523853302002, "token_acc": 0.9267042542286007, "grad_norm": 0.48343992233276367, "learning_rate": 9.074347889294016e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203514, "epoch": 1.1868686868686869, "step": 235}, {"loss": 0.38658602237701417, "token_acc": 0.8714543367765207, "grad_norm": 0.7413302659988403, "learning_rate": 9.025350265637815e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204106, "epoch": 1.2121212121212122, "step": 240}, {"eval_loss": 0.5141359567642212, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.7909, "eval_samples_per_second": 2.233, "eval_steps_per_second": 2.233, "epoch": 1.2121212121212122, "step": 240}, {"loss": 0.24457972049713134, "token_acc": 0.8632831873036866, "grad_norm": 0.4979130029678345, "learning_rate": 8.975228612720416e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204329, "epoch": 1.2373737373737375, "step": 245}, {"loss": 0.38393685817718504, "token_acc": 0.8645948945615982, "grad_norm": 0.7785468101501465, "learning_rate": 8.923996926393305e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205736, "epoch": 1.2626262626262625, "step": 250}, {"loss": 0.34473409652709963, "token_acc": 0.8690122539918307, "grad_norm": 0.429641991853714, "learning_rate": 8.871669512471068e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.204547, "epoch": 1.2878787878787878, "step": 255}, {"loss": 0.32251389026641847, "token_acc": 0.8750648901193978, "grad_norm": 0.6987989544868469, "learning_rate": 8.818260982736661e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.2052, "epoch": 1.3131313131313131, "step": 260}, {"eval_loss": 0.4578605592250824, "eval_token_acc": 
0.7602397602397603, "eval_runtime": 1.7868, "eval_samples_per_second": 2.239, "eval_steps_per_second": 2.239, "epoch": 1.3131313131313131, "step": 260}, {"loss": 0.2529636859893799, "token_acc": 0.899982859101817, "grad_norm": 0.22191783785820007, "learning_rate": 8.763786250861256e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203055, "epoch": 1.3383838383838385, "step": 265}, {"loss": 0.23594443798065184, "token_acc": 0.9083276216586703, "grad_norm": 0.4377196431159973, "learning_rate": 8.708260528239788e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20368, "epoch": 1.3636363636363638, "step": 270}, {"loss": 0.25529866218566893, "token_acc": 0.9052517596101787, "grad_norm": 0.3827890455722809, "learning_rate": 8.651699319743347e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203283, "epoch": 1.3888888888888888, "step": 275}, {"loss": 0.3728013038635254, "token_acc": 0.8851649320867878, "grad_norm": 0.8312171697616577, "learning_rate": 8.594118419389647e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.203978, "epoch": 1.4141414141414141, "step": 280}, {"eval_loss": 0.4208544194698334, "eval_token_acc": 0.7592407592407593, "eval_runtime": 1.7793, "eval_samples_per_second": 2.248, "eval_steps_per_second": 2.248, "epoch": 1.4141414141414141, "step": 280}, {"loss": 0.17479790449142457, "token_acc": 0.9063637940003468, "grad_norm": 0.7446438074111938, "learning_rate": 8.535533905932738e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20415, "epoch": 1.4393939393939394, "step": 285}, {"loss": 0.33746435642242434, "token_acc": 0.8648478488982162, "grad_norm": 0.7986549139022827, "learning_rate": 8.475962138373213e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205059, "epoch": 1.4646464646464645, "step": 290}, {"loss": 0.3507548332214355, "token_acc": 0.8484919335983165, "grad_norm": 0.4530141353607178, "learning_rate": 8.415419751390155e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206092, "epoch": 1.4898989898989898, "step": 295}, 
{"loss": 0.2767606496810913, "token_acc": 0.8876570583887657, "grad_norm": 1.5765312910079956, "learning_rate": 8.353923650696118e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206251, "epoch": 1.5151515151515151, "step": 300}, {"eval_loss": 0.4114963114261627, "eval_token_acc": 0.7542457542457542, "eval_runtime": 1.7835, "eval_samples_per_second": 2.243, "eval_steps_per_second": 2.243, "epoch": 1.5151515151515151, "step": 300}, {"loss": 0.33141241073608396, "token_acc": 0.8746642793196061, "grad_norm": 0.5467161536216736, "learning_rate": 8.291491008316409e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205011, "epoch": 1.5404040404040404, "step": 305}, {"loss": 0.2601468086242676, "token_acc": 0.9101903695408735, "grad_norm": 0.5182997584342957, "learning_rate": 8.228139257794012e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.205931, "epoch": 1.5656565656565657, "step": 310}, {"loss": 0.22466778755187988, "token_acc": 0.9071316614420063, "grad_norm": 1.4313561916351318, "learning_rate": 8.163886089321493e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206665, "epoch": 1.5909090909090908, "step": 315}, {"loss": 0.2971429586410522, "token_acc": 0.9094011790257525, "grad_norm": 0.4848739504814148, "learning_rate": 8.098749444801224e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206913, "epoch": 1.6161616161616161, "step": 320}, {"eval_loss": 0.34123557806015015, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.7732, "eval_samples_per_second": 2.256, "eval_steps_per_second": 2.256, "epoch": 1.6161616161616161, "step": 320}, {"loss": 0.30359759330749514, "token_acc": 0.8647662793839502, "grad_norm": 0.5274702906608582, "learning_rate": 8.032747512835337e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.206592, "epoch": 1.6414141414141414, "step": 325}, {"loss": 0.369882869720459, "token_acc": 0.8955607977696761, "grad_norm": 0.7116047739982605, "learning_rate": 7.965898723646776e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 
0.207439, "epoch": 1.6666666666666665, "step": 330}, {"loss": 0.2918565034866333, "token_acc": 0.89257481648786, "grad_norm": 0.46889209747314453, "learning_rate": 7.898221743932888e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207421, "epoch": 1.691919191919192, "step": 335}, {"loss": 0.20344338417053223, "token_acc": 0.9203519855595668, "grad_norm": 0.976447582244873, "learning_rate": 7.829735471652978e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208201, "epoch": 1.7171717171717171, "step": 340}, {"eval_loss": 0.34089022874832153, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.8007, "eval_samples_per_second": 2.221, "eval_steps_per_second": 2.221, "epoch": 1.7171717171717171, "step": 340}, {"loss": 0.24397382736206055, "token_acc": 0.9052574525745257, "grad_norm": 0.5278966426849365, "learning_rate": 7.760459030751284e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207523, "epoch": 1.7424242424242424, "step": 345}, {"loss": 0.18279753923416137, "token_acc": 0.9326676907322069, "grad_norm": 1.4450560808181763, "learning_rate": 7.690411765816864e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208339, "epoch": 1.7676767676767677, "step": 350}, {"loss": 0.3635742664337158, "token_acc": 0.8810979752683789, "grad_norm": 0.9007227420806885, "learning_rate": 7.619613236681843e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208407, "epoch": 1.7929292929292928, "step": 355}, {"loss": 0.2239600419998169, "token_acc": 0.908592093777279, "grad_norm": 0.3327200412750244, "learning_rate": 7.548083212959588e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208397, "epoch": 1.8181818181818183, "step": 360}, {"eval_loss": 0.3721417784690857, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8042, "eval_samples_per_second": 2.217, "eval_steps_per_second": 2.217, "epoch": 1.8181818181818183, "step": 360}, {"loss": 0.2930032253265381, "token_acc": 0.8698196001950268, "grad_norm": 0.898142397403717, "learning_rate": 
7.475841668524268e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208386, "epoch": 1.8434343434343434, "step": 365}, {"loss": 0.30235629081726073, "token_acc": 0.8878266411727215, "grad_norm": 0.37038329243659973, "learning_rate": 7.402908775933419e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208339, "epoch": 1.8686868686868687, "step": 370}, {"loss": 0.3541992664337158, "token_acc": 0.8819656712908536, "grad_norm": 0.5051683187484741, "learning_rate": 7.329304900794991e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208034, "epoch": 1.893939393939394, "step": 375}, {"loss": 0.31901164054870607, "token_acc": 0.8962932111620159, "grad_norm": 0.716480016708374, "learning_rate": 7.255050596080509e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208147, "epoch": 1.9191919191919191, "step": 380}, {"eval_loss": 0.31777071952819824, "eval_token_acc": 0.7632367632367633, "eval_runtime": 1.8015, "eval_samples_per_second": 2.22, "eval_steps_per_second": 2.22, "epoch": 1.9191919191919191, "step": 380}, {"loss": 0.3302799701690674, "token_acc": 0.8846450617283951, "grad_norm": 0.24409626424312592, "learning_rate": 7.180166596385914e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207591, "epoch": 1.9444444444444444, "step": 385}, {"loss": 0.23348629474639893, "token_acc": 0.9038998906572713, "grad_norm": 0.41487976908683777, "learning_rate": 7.104673812141675e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207441, "epoch": 1.9696969696969697, "step": 390}, {"loss": 0.19255179166793823, "token_acc": 0.9186485885752389, "grad_norm": 0.3993523418903351, "learning_rate": 7.02859332377382e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208111, "epoch": 1.9949494949494948, "step": 395}, {"loss": 0.17468175888061524, "token_acc": 0.9561740243122201, "grad_norm": 0.3870049715042114, "learning_rate": 6.951946375817474e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209087, "epoch": 2.0202020202020203, "step": 400}, {"eval_loss": 
0.28842809796333313, "eval_token_acc": 0.7612387612387612, "eval_runtime": 1.7921, "eval_samples_per_second": 2.232, "eval_steps_per_second": 2.232, "epoch": 2.0202020202020203, "step": 400}, {"loss": 0.1243563175201416, "token_acc": 0.9203394470298385, "grad_norm": 0.8590208292007446, "learning_rate": 6.874754370984606e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208801, "epoch": 2.0454545454545454, "step": 405}, {"loss": 0.12814103364944457, "token_acc": 0.9328753399169887, "grad_norm": 1.1880351305007935, "learning_rate": 6.797038864187564e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208916, "epoch": 2.0707070707070705, "step": 410}, {"loss": 0.1351101279258728, "token_acc": 0.9418245923314236, "grad_norm": 0.5621036887168884, "learning_rate": 6.718821556520151e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20947, "epoch": 2.095959595959596, "step": 415}, {"loss": 0.08711874485015869, "token_acc": 0.9826417141307295, "grad_norm": 0.3390972912311554, "learning_rate": 6.640124289197845e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.21021, "epoch": 2.121212121212121, "step": 420}, {"eval_loss": 0.2877776026725769, "eval_token_acc": 0.7612387612387612, "eval_runtime": 1.7912, "eval_samples_per_second": 2.233, "eval_steps_per_second": 2.233, "epoch": 2.121212121212121, "step": 420}, {"loss": 0.14229173660278321, "token_acc": 0.9111275964391692, "grad_norm": 0.6231220960617065, "learning_rate": 6.560969037458933e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210123, "epoch": 2.1464646464646466, "step": 425}, {"loss": 0.09396474957466125, "token_acc": 0.9717786854808763, "grad_norm": 0.2991831600666046, "learning_rate": 6.481377904428171e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209878, "epoch": 2.1717171717171717, "step": 430}, {"loss": 0.1196476936340332, "token_acc": 0.9668395702111894, "grad_norm": 0.2828837037086487, "learning_rate": 6.401373114944781e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209161, "epoch": 
2.196969696969697, "step": 435}, {"loss": 0.13254005908966066, "token_acc": 0.9543882759756298, "grad_norm": 0.8178226947784424, "learning_rate": 6.320977009356431e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209483, "epoch": 2.2222222222222223, "step": 440}, {"eval_loss": 0.3008834719657898, "eval_token_acc": 0.7562437562437563, "eval_runtime": 1.7791, "eval_samples_per_second": 2.248, "eval_steps_per_second": 2.248, "epoch": 2.2222222222222223, "step": 440}, {"loss": 0.055006617307662965, "token_acc": 0.9505077773492737, "grad_norm": 0.4310154318809509, "learning_rate": 6.240212037280966e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209114, "epoch": 2.2474747474747474, "step": 445}, {"loss": 0.1947050929069519, "token_acc": 0.932483120780195, "grad_norm": 0.8850460648536682, "learning_rate": 6.159100751337642e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209501, "epoch": 2.2727272727272725, "step": 450}, {"loss": 0.0962955117225647, "token_acc": 0.9671393509680938, "grad_norm": 0.4071020185947418, "learning_rate": 6.077665800849568e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209586, "epoch": 2.297979797979798, "step": 455}, {"loss": 0.11653723716735839, "token_acc": 0.9625259129325466, "grad_norm": 0.41429275274276733, "learning_rate": 5.99592992551918e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209806, "epoch": 2.323232323232323, "step": 460}, {"eval_loss": 0.3004966974258423, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.7662, "eval_samples_per_second": 2.265, "eval_steps_per_second": 2.265, "epoch": 2.323232323232323, "step": 460}, {"loss": 0.12927284240722656, "token_acc": 0.9099099099099099, "grad_norm": 0.5336225628852844, "learning_rate": 5.913915949078452e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210062, "epoch": 2.3484848484848486, "step": 465}, {"loss": 0.08137755393981934, "token_acc": 0.9656623081296191, "grad_norm": 0.6900014281272888, "learning_rate": 5.831646772915651e-05, "memory(GiB)": 
194.42, "train_speed(iter/s)": 0.209705, "epoch": 2.3737373737373737, "step": 470}, {"loss": 0.10649137496948242, "token_acc": 0.9598332701780977, "grad_norm": 0.8796831369400024, "learning_rate": 5.749145369680407e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210106, "epoch": 2.398989898989899, "step": 475}, {"loss": 0.17189998626708985, "token_acc": 0.9404330609149886, "grad_norm": 0.33510109782218933, "learning_rate": 5.666434776868895e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209154, "epoch": 2.4242424242424243, "step": 480}, {"eval_loss": 0.3011992573738098, "eval_token_acc": 0.7582417582417582, "eval_runtime": 1.7649, "eval_samples_per_second": 2.266, "eval_steps_per_second": 2.266, "epoch": 2.4242424242424243, "step": 480}, {"loss": 0.0982659637928009, "token_acc": 0.9170192759755524, "grad_norm": 0.720079243183136, "learning_rate": 5.583538090390882e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209559, "epoch": 2.4494949494949494, "step": 485}, {"loss": 0.19064462184906006, "token_acc": 0.9257759784075573, "grad_norm": 0.5884720087051392, "learning_rate": 5.5004784581204927e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209499, "epoch": 2.474747474747475, "step": 490}, {"loss": 0.10975323915481568, "token_acc": 0.958795231123196, "grad_norm": 0.33910003304481506, "learning_rate": 5.41727907343245e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209984, "epoch": 2.5, "step": 495}, {"loss": 0.17116068601608275, "token_acc": 0.9163900944600459, "grad_norm": 0.5363466143608093, "learning_rate": 5.3339631687256084e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209844, "epoch": 2.525252525252525, "step": 500}, {"eval_loss": 0.2906345725059509, "eval_token_acc": 0.7522477522477522, "eval_runtime": 1.8092, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 2.525252525252525, "step": 500}, {"loss": 0.09048279523849487, "token_acc": 0.9289609432571849, "grad_norm": 0.6032365560531616, "learning_rate": 
5.250554008935596e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209717, "epoch": 2.5505050505050506, "step": 505}, {"loss": 0.09625995755195618, "token_acc": 0.9664948453608248, "grad_norm": 0.6278122067451477, "learning_rate": 5.167074885038373e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210284, "epoch": 2.5757575757575757, "step": 510}, {"loss": 0.13246564865112304, "token_acc": 0.9501214574898785, "grad_norm": 0.38628754019737244, "learning_rate": 5.0835491075465045e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210549, "epoch": 2.601010101010101, "step": 515}, {"loss": 0.13336524963378907, "token_acc": 0.942733657482442, "grad_norm": 0.465506374835968, "learning_rate": 5e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210504, "epoch": 2.6262626262626263, "step": 520}, {"eval_loss": 0.2984510660171509, "eval_token_acc": 0.7502497502497503, "eval_runtime": 1.8075, "eval_samples_per_second": 2.213, "eval_steps_per_second": 2.213, "epoch": 2.6262626262626263, "step": 520}, {"loss": 0.09229624271392822, "token_acc": 0.931699604743083, "grad_norm": 0.4542198181152344, "learning_rate": 4.916450892453495e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.210556, "epoch": 2.6515151515151514, "step": 525}, {"loss": 0.16157912015914916, "token_acc": 0.9453226706341826, "grad_norm": 0.3877967596054077, "learning_rate": 4.832925114961629e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209926, "epoch": 2.676767676767677, "step": 530}, {"loss": 0.09830494523048401, "token_acc": 0.9667737290951379, "grad_norm": 0.7550373077392578, "learning_rate": 4.749445991064404e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.209454, "epoch": 2.702020202020202, "step": 535}, {"loss": 0.21627907752990722, "token_acc": 0.91897499740637, "grad_norm": 0.8958114981651306, "learning_rate": 4.666036831274392e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20902, "epoch": 2.7272727272727275, "step": 540}, {"eval_loss": 0.29240885376930237, 
"eval_token_acc": 0.7582417582417582, "eval_runtime": 1.8087, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 2.7272727272727275, "step": 540}, {"loss": 0.15035767555236818, "token_acc": 0.9105213715359324, "grad_norm": 0.6230780482292175, "learning_rate": 4.582720926567552e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208676, "epoch": 2.7525252525252526, "step": 545}, {"loss": 0.10666660070419312, "token_acc": 0.9625537139349294, "grad_norm": 0.2731610834598541, "learning_rate": 4.4995215418795085e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.20872, "epoch": 2.7777777777777777, "step": 550}, {"loss": 0.092490154504776, "token_acc": 0.9688758129451843, "grad_norm": 0.5083901286125183, "learning_rate": 4.416461909609119e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208968, "epoch": 2.8030303030303028, "step": 555}, {"loss": 0.12382739782333374, "token_acc": 0.9427140588738243, "grad_norm": 0.3543936014175415, "learning_rate": 4.333565223131107e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208825, "epoch": 2.8282828282828283, "step": 560}, {"eval_loss": 0.28748947381973267, "eval_token_acc": 0.7542457542457542, "eval_runtime": 1.8169, "eval_samples_per_second": 2.202, "eval_steps_per_second": 2.202, "epoch": 2.8282828282828283, "step": 560}, {"loss": 0.14281988143920898, "token_acc": 0.9215728176087189, "grad_norm": 0.6817888617515564, "learning_rate": 4.250854630319593e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208389, "epoch": 2.8535353535353534, "step": 565}, {"loss": 0.19629952907562256, "token_acc": 0.9246951219512195, "grad_norm": 0.6411938071250916, "learning_rate": 4.1683532270843504e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208033, "epoch": 2.878787878787879, "step": 570}, {"loss": 0.1060869812965393, "token_acc": 0.9567567567567568, "grad_norm": 0.7665055990219116, "learning_rate": 4.0860840509215496e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207971, "epoch": 
2.904040404040404, "step": 575}, {"loss": 0.1460867166519165, "token_acc": 0.9379691821414461, "grad_norm": 2.561556100845337, "learning_rate": 4.0040700744808204e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208359, "epoch": 2.929292929292929, "step": 580}, {"eval_loss": 0.2931186854839325, "eval_token_acc": 0.7532467532467533, "eval_runtime": 1.8095, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "epoch": 2.929292929292929, "step": 580}, {"loss": 0.14551491737365724, "token_acc": 0.907440654298823, "grad_norm": 0.7562662363052368, "learning_rate": 3.922334199150432e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208553, "epoch": 2.9545454545454546, "step": 585}, {"loss": 0.14465657472610474, "token_acc": 0.9507758159443552, "grad_norm": 0.7358995676040649, "learning_rate": 3.840899248662358e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.208115, "epoch": 2.9797979797979797, "step": 590}, {"loss": 0.12801222801208495, "token_acc": 0.9628496042216359, "grad_norm": 0.2242085188627243, "learning_rate": 3.7597879627190334e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207756, "epoch": 3.005050505050505, "step": 595}, {"loss": 0.05944470763206482, "token_acc": 0.9819713314615044, "grad_norm": 0.5652392506599426, "learning_rate": 3.6790229906435705e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207893, "epoch": 3.0303030303030303, "step": 600}, {"eval_loss": 0.30186352133750916, "eval_token_acc": 0.7572427572427572, "eval_runtime": 1.8433, "eval_samples_per_second": 2.17, "eval_steps_per_second": 2.17, "epoch": 3.0303030303030303, "step": 600}, {"loss": 0.04340478777885437, "token_acc": 0.9578030154689642, "grad_norm": 0.6330710649490356, "learning_rate": 3.598626885055219e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207185, "epoch": 3.0555555555555554, "step": 605}, {"loss": 0.019147023558616638, "token_acc": 0.9906427990235964, "grad_norm": 0.2928420305252075, "learning_rate": 3.5186220955718306e-05, 
"memory(GiB)": 194.42, "train_speed(iter/s)": 0.207564, "epoch": 3.080808080808081, "step": 610}, {"loss": 0.01184888556599617, "token_acc": 0.9950428120775124, "grad_norm": 0.22657504677772522, "learning_rate": 3.4390309625410686e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207984, "epoch": 3.106060606060606, "step": 615}, {"loss": 0.03439113795757294, "token_acc": 0.9885694884563151, "grad_norm": 0.38536500930786133, "learning_rate": 3.3598757108021546e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207846, "epoch": 3.1313131313131315, "step": 620}, {"eval_loss": 0.3092794716358185, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8078, "eval_samples_per_second": 2.213, "eval_steps_per_second": 2.213, "epoch": 3.1313131313131315, "step": 620}, {"loss": 0.033835414052009585, "token_acc": 0.9644806032344478, "grad_norm": 0.4387037754058838, "learning_rate": 3.281178443479852e-05, "memory(GiB)": 194.42, "train_speed(iter/s)": 0.207235, "epoch": 3.1565656565656566, "step": 625}, {"loss": 0.04778254330158234, "token_acc": 0.9859832635983263, "grad_norm": 0.5069209337234497, "learning_rate": 3.202961135812437e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207611, "epoch": 3.1818181818181817, "step": 630}, {"loss": 0.06957237720489502, "token_acc": 0.9698315118397086, "grad_norm": 0.7499670386314392, "learning_rate": 3.1252456290153954e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207401, "epoch": 3.207070707070707, "step": 635}, {"loss": 0.06834298372268677, "token_acc": 0.9677576941866145, "grad_norm": 0.4098314642906189, "learning_rate": 3.0480536241825263e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206982, "epoch": 3.2323232323232323, "step": 640}, {"eval_loss": 0.3071475625038147, "eval_token_acc": 0.7492507492507493, "eval_runtime": 1.8289, "eval_samples_per_second": 2.187, "eval_steps_per_second": 2.187, "epoch": 3.2323232323232323, "step": 640}, {"loss": 0.053527307510375974, "token_acc": 0.9555773168343275, "grad_norm": 
0.6657339930534363, "learning_rate": 2.9714066762261823e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206307, "epoch": 3.257575757575758, "step": 645}, {"loss": 0.058514750003814696, "token_acc": 0.980971797485559, "grad_norm": 0.35414841771125793, "learning_rate": 2.895326187858326e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206557, "epoch": 3.282828282828283, "step": 650}, {"loss": 0.025739893317222595, "token_acc": 0.9937519525148392, "grad_norm": 0.41236743330955505, "learning_rate": 2.8198334036140874e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206671, "epoch": 3.308080808080808, "step": 655}, {"loss": 0.04316897392272949, "token_acc": 0.9850356294536817, "grad_norm": 0.5106225609779358, "learning_rate": 2.74494940391949e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206584, "epoch": 3.3333333333333335, "step": 660}, {"eval_loss": 0.30926382541656494, "eval_token_acc": 0.7462537462537463, "eval_runtime": 1.7766, "eval_samples_per_second": 2.251, "eval_steps_per_second": 2.251, "epoch": 3.3333333333333335, "step": 660}, {"loss": 0.044805902242660525, "token_acc": 0.9384410139127121, "grad_norm": 0.7511043548583984, "learning_rate": 2.6706950992050094e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206749, "epoch": 3.3585858585858586, "step": 665}, {"loss": 0.04475100636482239, "token_acc": 0.9816687737041719, "grad_norm": 0.7061121463775635, "learning_rate": 2.5970912240665813e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206885, "epoch": 3.3838383838383836, "step": 670}, {"loss": 0.036676472425460814, "token_acc": 0.9836784836784837, "grad_norm": 0.35980212688446045, "learning_rate": 2.5241583314757327e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207096, "epoch": 3.409090909090909, "step": 675}, {"loss": 0.05962592363357544, "token_acc": 0.9733040775278844, "grad_norm": 0.49686399102211, "learning_rate": 2.4519167870404125e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207419, "epoch": 
3.4343434343434343, "step": 680}, {"eval_loss": 0.30547747015953064, "eval_token_acc": 0.7422577422577422, "eval_runtime": 1.7648, "eval_samples_per_second": 2.267, "eval_steps_per_second": 2.267, "epoch": 3.4343434343434343, "step": 680}, {"loss": 0.07604837417602539, "token_acc": 0.9316927830500993, "grad_norm": 0.36548638343811035, "learning_rate": 2.3803867633181574e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207066, "epoch": 3.45959595959596, "step": 685}, {"loss": 0.05938977003097534, "token_acc": 0.9727115716753022, "grad_norm": 0.9540855288505554, "learning_rate": 2.3095882341831372e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.206879, "epoch": 3.484848484848485, "step": 690}, {"loss": 0.05741435885429382, "token_acc": 0.9801678108314263, "grad_norm": 0.49375462532043457, "learning_rate": 2.2395409692487175e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207044, "epoch": 3.51010101010101, "step": 695}, {"loss": 0.013945281505584717, "token_acc": 0.9956945388624519, "grad_norm": 0.1865593045949936, "learning_rate": 2.1702645283470236e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207418, "epoch": 3.5353535353535355, "step": 700}, {"eval_loss": 0.30903568863868713, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.7685, "eval_samples_per_second": 2.262, "eval_steps_per_second": 2.262, "epoch": 3.5353535353535355, "step": 700}, {"loss": 0.03774779438972473, "token_acc": 0.939033597583994, "grad_norm": 0.2819386422634125, "learning_rate": 2.1017782560671123e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207569, "epoch": 3.5606060606060606, "step": 705}, {"loss": 0.03159322738647461, "token_acc": 0.9888877240800923, "grad_norm": 0.43836522102355957, "learning_rate": 2.0341012763532243e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207322, "epoch": 3.5858585858585856, "step": 710}, {"loss": 0.002558489516377449, "token_acc": 0.9991300565463245, "grad_norm": 0.6935763955116272, "learning_rate": 1.967252487164663e-05, 
"memory(GiB)": 194.43, "train_speed(iter/s)": 0.20803, "epoch": 3.611111111111111, "step": 715}, {"loss": 0.01584031879901886, "token_acc": 0.9957315627223144, "grad_norm": 0.7125621438026428, "learning_rate": 1.9012505551987765e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207895, "epoch": 3.6363636363636362, "step": 720}, {"eval_loss": 0.3177046775817871, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.7785, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 3.6363636363636362, "step": 720}, {"loss": 0.03762938678264618, "token_acc": 0.955343466478143, "grad_norm": 0.4679706394672394, "learning_rate": 1.836113910678507e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207656, "epoch": 3.6616161616161618, "step": 725}, {"loss": 0.05497429370880127, "token_acc": 0.9796425024826216, "grad_norm": 0.318162739276886, "learning_rate": 1.771860742205988e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207349, "epoch": 3.686868686868687, "step": 730}, {"loss": 0.035741007328033446, "token_acc": 0.9839179435228568, "grad_norm": 0.3959096372127533, "learning_rate": 1.7085089916835923e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207241, "epoch": 3.712121212121212, "step": 735}, {"loss": 0.045097559690475464, "token_acc": 0.987012987012987, "grad_norm": 0.6112183332443237, "learning_rate": 1.646076349303884e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207546, "epoch": 3.7373737373737375, "step": 740}, {"eval_loss": 0.31488630175590515, "eval_token_acc": 0.7482517482517482, "eval_runtime": 1.8107, "eval_samples_per_second": 2.209, "eval_steps_per_second": 2.209, "epoch": 3.7373737373737375, "step": 740}, {"loss": 0.03688704967498779, "token_acc": 0.9172789115646258, "grad_norm": 0.2885132431983948, "learning_rate": 1.584580248609846e-05, "memory(GiB)": 194.43, "train_speed(iter/s)": 0.207875, "epoch": 3.7626262626262625, "step": 745}, {"loss": 0.034329149127006534, "token_acc": 0.9898200757575758, "grad_norm": 
0.7163310050964355, "learning_rate": 1.5240378616267886e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208273, "epoch": 3.787878787878788, "step": 750}, {"loss": 0.11100113391876221, "token_acc": 0.9534293604000299, "grad_norm": 0.4935724139213562, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207493, "epoch": 3.813131313131313, "step": 755}, {"loss": 0.011808309704065323, "token_acc": 0.995991448423303, "grad_norm": 0.6098226308822632, "learning_rate": 1.4058815806103542e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208006, "epoch": 3.8383838383838382, "step": 760}, {"eval_loss": 0.3122360408306122, "eval_token_acc": 0.7452547452547452, "eval_runtime": 1.7841, "eval_samples_per_second": 2.242, "eval_steps_per_second": 2.242, "epoch": 3.8383838383838382, "step": 760}, {"loss": 0.05923340916633606, "token_acc": 0.9374712643678161, "grad_norm": 0.4670480489730835, "learning_rate": 1.3483006802566544e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207991, "epoch": 3.8636363636363638, "step": 765}, {"loss": 0.03186772465705871, "token_acc": 0.9895613272026842, "grad_norm": 0.22090964019298553, "learning_rate": 1.2917394717602121e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207996, "epoch": 3.888888888888889, "step": 770}, {"loss": 0.03672673106193543, "token_acc": 0.9879902705989663, "grad_norm": 0.48269495368003845, "learning_rate": 1.2362137491387432e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207983, "epoch": 3.9141414141414144, "step": 775}, {"loss": 0.047962135076522826, "token_acc": 0.980590717299578, "grad_norm": 0.4281235635280609, "learning_rate": 1.1817390172633403e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207969, "epoch": 3.9393939393939394, "step": 780}, {"eval_loss": 0.3114301860332489, "eval_token_acc": 0.7472527472527473, "eval_runtime": 1.7899, "eval_samples_per_second": 2.235, "eval_steps_per_second": 2.235, "epoch": 3.9393939393939394, "step": 780}, {"loss": 
0.03378086090087891, "token_acc": 0.9337579617834395, "grad_norm": 0.45910346508026123, "learning_rate": 1.1283304875289336e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208066, "epoch": 3.9646464646464645, "step": 785}, {"loss": 0.0462653785943985, "token_acc": 0.9858657243816255, "grad_norm": 0.5526273250579834, "learning_rate": 1.0760030736066951e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208239, "epoch": 3.98989898989899, "step": 790}, {"loss": 0.033093854784965515, "token_acc": 0.9894450663681433, "grad_norm": 0.28537100553512573, "learning_rate": 1.024771387279585e-05, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208413, "epoch": 4.015151515151516, "step": 795}, {"loss": 0.008343618363142014, "token_acc": 0.9987666164177059, "grad_norm": 0.26249536871910095, "learning_rate": 9.746497343621857e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208448, "epoch": 4.040404040404041, "step": 800}, {"eval_loss": 0.31291159987449646, "eval_token_acc": 0.7442557442557443, "eval_runtime": 1.8149, "eval_samples_per_second": 2.204, "eval_steps_per_second": 2.204, "epoch": 4.040404040404041, "step": 800}, {"loss": 0.01603064388036728, "token_acc": 0.962771327612317, "grad_norm": 0.37422114610671997, "learning_rate": 9.256521107059834e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208215, "epoch": 4.065656565656566, "step": 805}, {"loss": 0.0026650244370102884, "token_acc": 0.9998035749361619, "grad_norm": 0.01700465753674507, "learning_rate": 8.777921982911996e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208521, "epoch": 4.090909090909091, "step": 810}, {"loss": 0.017160463333129882, "token_acc": 0.9931600547195623, "grad_norm": 0.3416445255279541, "learning_rate": 8.310833614062651e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208645, "epoch": 4.116161616161616, "step": 815}, {"loss": 0.007264973968267441, "token_acc": 0.9985950671245707, "grad_norm": 0.025509856641292572, "learning_rate": 7.85538642916015e-06, "memory(GiB)": 
194.45, "train_speed(iter/s)": 0.208653, "epoch": 4.141414141414141, "step": 820}, {"eval_loss": 0.31731295585632324, "eval_token_acc": 0.7422577422577422, "eval_runtime": 1.7956, "eval_samples_per_second": 2.228, "eval_steps_per_second": 2.228, "epoch": 4.141414141414141, "step": 820}, {"loss": 0.02214038074016571, "token_acc": 0.9670045287901661, "grad_norm": 0.27206850051879883, "learning_rate": 7.4117076061961885e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208283, "epoch": 4.166666666666667, "step": 825}, {"loss": 0.04925653636455536, "token_acc": 0.9612062655540916, "grad_norm": 0.3412053883075714, "learning_rate": 6.979921036993042e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208304, "epoch": 4.191919191919192, "step": 830}, {"loss": 0.025851538777351378, "token_acc": 0.9915025106218617, "grad_norm": 0.3712901771068573, "learning_rate": 6.5601472926081766e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208305, "epoch": 4.217171717171717, "step": 835}, {"loss": 0.027274680137634278, "token_acc": 0.9867660142348754, "grad_norm": 0.31404536962509155, "learning_rate": 6.152503589666425e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208162, "epoch": 4.242424242424242, "step": 840}, {"eval_loss": 0.32319602370262146, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.7959, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 4.242424242424242, "step": 840}, {"loss": 0.07043209075927734, "token_acc": 0.9483071053886505, "grad_norm": 0.7435818910598755, "learning_rate": 5.757103757628573e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207659, "epoch": 4.267676767676767, "step": 845}, {"loss": 0.0496614933013916, "token_acc": 0.9817774610607757, "grad_norm": 0.396167129278183, "learning_rate": 5.374058207005944e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207405, "epoch": 4.292929292929293, "step": 850}, {"loss": 0.01363416612148285, "token_acc": 0.9948678750818956, "grad_norm": 0.12084542959928513, 
"learning_rate": 5.0034738985296095e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207257, "epoch": 4.318181818181818, "step": 855}, {"loss": 0.010331088304519653, "token_acc": 0.9975838926174496, "grad_norm": 0.1820814311504364, "learning_rate": 4.645454313282965e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207206, "epoch": 4.343434343434343, "step": 860}, {"eval_loss": 0.328104168176651, "eval_token_acc": 0.7452547452547452, "eval_runtime": 1.824, "eval_samples_per_second": 2.193, "eval_steps_per_second": 2.193, "epoch": 4.343434343434343, "step": 860}, {"loss": 0.009320738911628722, "token_acc": 0.9639168343393696, "grad_norm": 0.1977299004793167, "learning_rate": 4.3000994238058644e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207088, "epoch": 4.3686868686868685, "step": 865}, {"loss": 0.007078251987695694, "token_acc": 0.9981830194912454, "grad_norm": 0.20129217207431793, "learning_rate": 3.967505666178556e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207247, "epoch": 4.393939393939394, "step": 870}, {"loss": 0.006438987702131272, "token_acc": 0.9970992071166118, "grad_norm": 0.15680421888828278, "learning_rate": 3.647765913093132e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207512, "epoch": 4.41919191919192, "step": 875}, {"loss": 0.01718273162841797, "token_acc": 0.9943947886683836, "grad_norm": 0.12446325272321701, "learning_rate": 3.340969447919873e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207596, "epoch": 4.444444444444445, "step": 880}, {"eval_loss": 0.3302783668041229, "eval_token_acc": 0.7432567432567433, "eval_runtime": 1.8139, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "epoch": 4.444444444444445, "step": 880}, {"loss": 0.0076537981629371645, "token_acc": 0.9654152738493186, "grad_norm": 0.21941973268985748, "learning_rate": 3.0472019397761064e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207479, "epoch": 4.46969696969697, "step": 885}, {"loss": 0.013453680276870727, 
"token_acc": 0.9971350613915416, "grad_norm": 0.4180464744567871, "learning_rate": 2.7665454196040664e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207427, "epoch": 4.494949494949495, "step": 890}, {"loss": 0.022955694794654848, "token_acc": 0.9932196424902404, "grad_norm": 0.36694836616516113, "learning_rate": 2.4990782572647975e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207683, "epoch": 4.52020202020202, "step": 895}, {"loss": 0.03834371864795685, "token_acc": 0.9876448720752241, "grad_norm": 0.3065175712108612, "learning_rate": 2.2448751396543787e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207506, "epoch": 4.545454545454545, "step": 900}, {"eval_loss": 0.33111608028411865, "eval_token_acc": 0.7402597402597403, "eval_runtime": 1.8047, "eval_samples_per_second": 2.216, "eval_steps_per_second": 2.216, "epoch": 4.545454545454545, "step": 900}, {"loss": 0.0027513707056641577, "token_acc": 0.9582614153459909, "grad_norm": 0.06433413922786713, "learning_rate": 2.004007049848461e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207542, "epoch": 4.570707070707071, "step": 905}, {"loss": 0.007540231198072433, "token_acc": 0.9984427718660783, "grad_norm": 0.08144180476665497, "learning_rate": 1.7765412472811771e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207932, "epoch": 4.595959595959596, "step": 910}, {"loss": 0.01742658168077469, "token_acc": 0.9934481182386103, "grad_norm": 0.014387480914592743, "learning_rate": 1.5625412489637337e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208031, "epoch": 4.621212121212121, "step": 915}, {"loss": 0.005430782586336136, "token_acc": 0.9978392394122731, "grad_norm": 0.29279810190200806, "learning_rate": 1.3620668117481472e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208309, "epoch": 4.646464646464646, "step": 920}, {"eval_loss": 0.3334895372390747, "eval_token_acc": 0.7372627372627373, "eval_runtime": 1.796, "eval_samples_per_second": 2.227, "eval_steps_per_second": 2.227, "epoch": 
4.646464646464646, "step": 920}, {"loss": 0.002384462393820286, "token_acc": 0.9558604728054224, "grad_norm": 0.14231210947036743, "learning_rate": 1.1751739156407649e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208319, "epoch": 4.671717171717171, "step": 925}, {"loss": 0.014989945292472839, "token_acc": 0.9928400954653938, "grad_norm": 0.0057262247428298, "learning_rate": 1.0019147481706625e-06, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208549, "epoch": 4.696969696969697, "step": 930}, {"loss": 0.02435753494501114, "token_acc": 0.9895142941932931, "grad_norm": 0.4201620817184448, "learning_rate": 8.423376898168245e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208233, "epoch": 4.722222222222222, "step": 935}, {"loss": 0.015529252588748932, "token_acc": 0.9936803592216863, "grad_norm": 0.15343283116817474, "learning_rate": 6.964873004985717e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208371, "epoch": 4.747474747474747, "step": 940}, {"eval_loss": 0.3332846760749817, "eval_token_acc": 0.7412587412587412, "eval_runtime": 1.8041, "eval_samples_per_second": 2.217, "eval_steps_per_second": 2.217, "epoch": 4.747474747474747, "step": 940}, {"loss": 0.012052442133426666, "token_acc": 0.9724211084592946, "grad_norm": 0.08844652026891708, "learning_rate": 5.644043071326932e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207774, "epoch": 4.7727272727272725, "step": 945}, {"loss": 0.010218892991542817, "token_acc": 0.9942298445263664, "grad_norm": 0.014770667999982834, "learning_rate": 4.461255922609986e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207847, "epoch": 4.797979797979798, "step": 950}, {"loss": 0.003109816461801529, "token_acc": 0.9989392734022806, "grad_norm": 0.14186975359916687, "learning_rate": 3.416841837512952e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207847, "epoch": 4.8232323232323235, "step": 955}, {"loss": 0.027083656191825865, "token_acc": 0.9918020343100046, "grad_norm": 0.3466111421585083, 
"learning_rate": 2.511092455747932e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207931, "epoch": 4.848484848484849, "step": 960}, {"eval_loss": 0.33344024419784546, "eval_token_acc": 0.7412587412587412, "eval_runtime": 1.7787, "eval_samples_per_second": 2.249, "eval_steps_per_second": 2.249, "epoch": 4.848484848484849, "step": 960}, {"loss": 0.0134581059217453, "token_acc": 0.9534619750283768, "grad_norm": 0.18189793825149536, "learning_rate": 1.7442606966242004e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.207945, "epoch": 4.873737373737374, "step": 965}, {"loss": 0.008723243325948715, "token_acc": 0.9973284354650191, "grad_norm": 0.23113778233528137, "learning_rate": 1.1165606884234181e-07, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.20813, "epoch": 4.898989898989899, "step": 970}, {"loss": 0.005283674597740174, "token_acc": 0.9985783915515841, "grad_norm": 0.0857037901878357, "learning_rate": 6.281677086071303e-08, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208414, "epoch": 4.924242424242424, "step": 975}, {"loss": 0.035409435629844666, "token_acc": 0.9834126862233143, "grad_norm": 0.053703807294368744, "learning_rate": 2.792181348726941e-08, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208454, "epoch": 4.94949494949495, "step": 980}, {"eval_loss": 0.33373889327049255, "eval_token_acc": 0.7372627372627373, "eval_runtime": 1.7669, "eval_samples_per_second": 2.264, "eval_steps_per_second": 2.264, "epoch": 4.94949494949495, "step": 980}, {"loss": 0.06211912035942078, "token_acc": 0.9500478265490487, "grad_norm": 0.2665289044380188, "learning_rate": 6.980940707146389e-09, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208152, "epoch": 4.974747474747475, "step": 985}, {"loss": 0.003582773730158806, "token_acc": 0.9995134609146935, "grad_norm": 0.14736372232437134, "learning_rate": 0.0, "memory(GiB)": 194.45, "train_speed(iter/s)": 0.208263, "epoch": 5.0, "step": 990}, {"eval_loss": 0.3324023485183716, "eval_token_acc": 0.7432567432567433, 
"eval_runtime": 1.7947, "eval_samples_per_second": 2.229, "eval_steps_per_second": 2.229, "epoch": 5.0, "step": 990}, {"train_runtime": 4757.6516, "train_samples_per_second": 0.416, "train_steps_per_second": 0.208, "total_flos": 6.310258299632026e+17, "train_loss": 0.19778904201312347, "epoch": 5.0, "step": 990}], "memory": 194.44921875} diff --git a/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs/events.out.tfevents.1737752544.kml-task-547024-record-9965643-prod-worker-0.101822.0 b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs/events.out.tfevents.1737752544.kml-task-547024-record-9965643-prod-worker-0.101822.0 new file mode 100644 index 0000000000000000000000000000000000000000..d43ac34d44dcc0dc04d76fadb85133ed20ce6abf --- /dev/null +++ b/output_deepseek_sft/deepseek-r1-70b_400_0.5_sft_4200_rank16_epoch5_what/v0-20250124-210025/runs/events.out.tfevents.1737752544.kml-task-547024-record-9965643-prod-worker-0.101822.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e2ed7335ed0d2b5fbf2b3da7dd67e20802814a02a961cc86fc18c925ab541a +size 99059